diff --git a/.buildkite/ci_config.yaml b/.buildkite/ci_config.yaml
new file mode 100644
index 0000000000000..199c33159fde3
--- /dev/null
+++ b/.buildkite/ci_config.yaml
@@ -0,0 +1,24 @@
+name: vllm_ci
+job_dirs:
+  - ".buildkite/test_areas"
+  - ".buildkite/image_build"
+run_all_patterns:
+  - "docker/Dockerfile"
+  - "CMakeLists.txt"
+  - "requirements/common.txt"
+  - "requirements/cuda.txt"
+  - "requirements/build.txt"
+  - "requirements/test.txt"
+  - "setup.py"
+  - "csrc/"
+  - "cmake/"
+run_all_exclude_patterns:
+  - "docker/Dockerfile."
+  - "csrc/cpu/"
+  - "csrc/rocm/"
+  - "cmake/hipify.py"
+  - "cmake/cpu_extension.cmake"
+registries: public.ecr.aws/q9t5s3a7
+repositories:
+  main: "vllm-ci-postmerge-repo"
+  premerge: "vllm-ci-test-repo"
diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh
new file mode 100755
index 0000000000000..9a2384e524b63
--- /dev/null
+++ b/.buildkite/image_build/image_build.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 8 ]]; then
+  echo "Usage: $0 <registry> <repo> <buildkite_commit> <branch> <vllm_use_precompiled> <vllm_merge_base_commit> <cache_from> <cache_to>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+BRANCH=$4
+VLLM_USE_PRECOMPILED=$5
+VLLM_MERGE_BASE_COMMIT=$6
+CACHE_FROM=$7
+CACHE_TO=$8
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+
+# docker buildx
+docker buildx create --name vllm-builder --driver docker-container --use
+docker buildx inspect --bootstrap
+docker buildx ls
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT) ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
+fi
+
+if [[ "${VLLM_USE_PRECOMPILED:-0}" == "1" ]]; then
+  merge_base_commit_build_args="--build-arg VLLM_MERGE_BASE_COMMIT=${VLLM_MERGE_BASE_COMMIT}"
+else
+  merge_base_commit_build_args=""
+fi
+
+# build
+docker buildx build --file docker/Dockerfile \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --build-arg USE_SCCACHE=1 \
+  --build-arg TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0 10.0" \
+  --build-arg FI_TORCH_CUDA_ARCH_LIST="8.0 8.9 9.0a 10.0a" \
+  --build-arg VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED:-0}" \
+  ${merge_base_commit_build_args} \
+  --cache-from type=registry,ref=${CACHE_FROM},mode=max \
+  --cache-to type=registry,ref=${CACHE_TO},mode=max \
+  --tag ${REGISTRY}/${REPO}:${BUILDKITE_COMMIT} \
+  $( [[ "${BRANCH}" == "main" ]] && echo "--tag ${REGISTRY}/${REPO}:latest" ) \
+  --push \
+  --target test \
+  --progress plain .
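Note: the new `.buildkite/ci_config.yaml` above is consumed by a pipeline generator that is not part of this diff, so the exact matching rules are not visible here. The sketch below is an illustration only of one plausible way the `run_all_patterns` / `run_all_exclude_patterns` lists could be applied to a changed-file list (simple prefix matching, excludes taking priority); the `should_run_all` function name, the PyYAML dependency, and the matching semantics are assumptions, not part of this change.

```python
# Illustrative sketch only: how a CI pipeline generator *might* interpret
# .buildkite/ci_config.yaml. Prefix matching and exclude-wins semantics are assumed.
from pathlib import Path

import yaml  # PyYAML


def should_run_all(changed_files: list[str],
                   config_path: str = ".buildkite/ci_config.yaml") -> bool:
    """Return True if any changed file hits a run-all pattern and no exclude pattern."""
    cfg = yaml.safe_load(Path(config_path).read_text())
    includes = cfg.get("run_all_patterns", [])
    excludes = cfg.get("run_all_exclude_patterns", [])
    for path in changed_files:
        # Excluded prefixes (e.g. "docker/Dockerfile.", "csrc/cpu/") are checked first,
        # so platform-specific files do not trigger the full CUDA test suite.
        if any(path.startswith(prefix) for prefix in excludes):
            continue
        if any(path.startswith(prefix) for prefix in includes):
            return True
    return False


if __name__ == "__main__":
    print(should_run_all(["csrc/attention/attention_kernels.cu"]))  # True under these assumptions
    print(should_run_all(["csrc/cpu/activation.cpp"]))              # False: excluded by "csrc/cpu/"
```

Under this reading, a change touching only `csrc/cpu/` or `docker/Dockerfile.cpu` would not trigger a full run, while any other `csrc/` or `docker/Dockerfile` change would.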
diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml
new file mode 100644
index 0000000000000..d01c71dd9becf
--- /dev/null
+++ b/.buildkite/image_build/image_build.yaml
@@ -0,0 +1,57 @@
+group: Abuild
+steps:
+  - label: ":docker: Build image"
+    key: image-build
+    depends_on: []
+    commands:
+      - .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $CACHE_FROM $CACHE_TO
+    retry:
+      automatic:
+        - exit_status: -1 # Agent was lost
+          limit: 2
+        - exit_status: -10 # Agent was lost
+          limit: 2
+
+  - label: ":docker: Build CPU image"
+    key: image-build-cpu
+    depends_on: []
+    commands:
+      - .buildkite/image_build/image_build_cpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1 # Agent was lost
+          limit: 2
+        - exit_status: -10 # Agent was lost
+          limit: 2
+
+  - label: ":docker: Build HPU image"
+    soft_fail: true
+    depends_on: []
+    key: image-build-hpu
+    commands:
+      - .buildkite/image_build/image_build_hpu.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1 # Agent was lost
+          limit: 2
+        - exit_status: -10 # Agent was lost
+          limit: 2
+
+  - label: ":docker: Build CPU arm64 image"
+    key: cpu-arm64-image-build
+    depends_on: []
+    optional: true
+    commands:
+      - .buildkite/image_build/image_build_cpu_arm64.sh $REGISTRY $REPO $BUILDKITE_COMMIT
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1 # Agent was lost
+          limit: 2
+        - exit_status: -10 # Agent was lost
+          limit: 2
diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh
new file mode 100755
index 0000000000000..a69732f430985
--- /dev/null
+++ b/.buildkite/image_build/image_build_cpu.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <registry> <repo> <buildkite_commit>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --build-arg VLLM_CPU_AVX512BF16=true \
+  --build-arg VLLM_CPU_AVX512VNNI=true \
+  --build-arg VLLM_CPU_AMXBF16=true \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+  --target vllm-test \
+  --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh
new file mode 100755
index 0000000000000..615298b6555bd
--- /dev/null
+++ b/.buildkite/image_build/image_build_cpu_arm64.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <registry> <repo> <buildkite_commit>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu) ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
+fi
+
+# build
+docker build --file docker/Dockerfile.cpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu \
+  --target vllm-test \
+  --progress plain .
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-cpu
diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh
new file mode 100755
index 0000000000000..192447ef4577e
--- /dev/null
+++ b/.buildkite/image_build/image_build_hpu.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+set -e
+
+if [[ $# -lt 3 ]]; then
+  echo "Usage: $0 <registry> <repo> <buildkite_commit>"
+  exit 1
+fi
+
+REGISTRY=$1
+REPO=$2
+BUILDKITE_COMMIT=$3
+
+# authenticate with AWS ECR
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin $REGISTRY
+
+# skip build if image already exists
+if [[ -z $(docker manifest inspect $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu) ]]; then
+  echo "Image not found, proceeding with build..."
+else
+  echo "Image found"
+  exit 0
+fi
+
+# build
+docker build \
+  --file tests/pytorch_ci_hud_benchmark/Dockerfile.hpu \
+  --build-arg max_jobs=16 \
+  --build-arg buildkite_commit=$BUILDKITE_COMMIT \
+  --tag $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu \
+  --progress plain \
+  https://github.com/vllm-project/vllm-gaudi.git
+
+# push
+docker push $REGISTRY/$REPO:$BUILDKITE_COMMIT-hpu
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index fbfc923998f89..151bb6abb0905 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -15,6 +15,21 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
+  - label: "Build arm64 wheel - CUDA 13.0"
+    depends_on: ~
+    id: build-wheel-arm64-cuda-13-0
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
+      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
+    env:
+      DOCKER_BUILDKIT: "1"
+
 
   # aarch64 build
   - label: "Build arm64 CPU wheel"
     depends_on: ~
@@ -25,7 +40,7 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
+      - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35"
     env:
       DOCKER_BUILDKIT: "1"
 
@@ -39,7 +54,7 @@ steps:
       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-wheels.sh" + - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_31" env: DOCKER_BUILDKIT: "1" @@ -52,7 +67,7 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - - "bash .buildkite/scripts/upload-wheels.sh" + - "bash .buildkite/scripts/upload-wheels.sh manylinux_2_35" env: DOCKER_BUILDKIT: "1" diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py index f10cb2f0b6e21..d0965fbd56405 100644 --- a/.buildkite/scripts/generate-nightly-index.py +++ b/.buildkite/scripts/generate-nightly-index.py @@ -372,6 +372,17 @@ if __name__ == "__main__": print(f"Found {len(wheel_files)} wheel files for version {version}: {wheel_files}") + # keep only "official" files for a non-nightly version (specifed by cli args) + PY_VERSION_RE = re.compile(r"^\d+\.\d+\.\d+([a-zA-Z0-9.+-]*)?$") + if PY_VERSION_RE.match(version): + # upload-wheels.sh ensures no "dev" is in args.version + wheel_files = list( + filter(lambda x: version in x and "dev" not in x, wheel_files) + ) + print(f"Non-nightly version detected, wheel files used: {wheel_files}") + else: + print("Nightly version detected, keeping all wheel files.") + # Generate index and metadata, assuming wheels and indices are stored as: # s3://vllm-wheels/{version}/ # s3://vllm-wheels// diff --git a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh index b5f6b2494792f..b6274d698d01a 100755 --- a/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh +++ b/.buildkite/scripts/hardware_ci/run-cpu-test-arm.sh @@ -36,11 +36,17 @@ function cpu_tests() { set -e python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" + # Run model tests + docker exec cpu-test bash -c " + set -e + pytest -x -v -s tests/models/multimodal/generation/test_whisper.py -m cpu_model" + # Run kernel tests docker exec cpu-test bash -c " set -e pytest -x -v -s tests/kernels/test_onednn.py - pytest -x -v -s tests/kernels/attention/test_cpu_attn.py" + pytest -x -v -s tests/kernels/attention/test_cpu_attn.py + pytest -x -v -s tests/kernels/moe/test_moe.py -k test_cpu_fused_moe_basic" # basic online serving docker exec cpu-test bash -c ' diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh index 1d5dba3f26f5a..dfc9db512d1e9 100644 --- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh +++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh @@ -47,6 +47,6 @@ docker run \ pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py pytest -v -s v1/structured_output pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py - pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py 
--ignore=v1/kv_connector/unit/test_shared_storage_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py + pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_example_connector.py --ignore=v1/kv_connector/unit/test_lmcache_integration.py pytest -v -s v1/test_serial_utils.py ' diff --git a/.buildkite/scripts/run-prime-rl-test.sh b/.buildkite/scripts/run-prime-rl-test.sh index 5b25c358fc4aa..3fb7c82c8d333 100755 --- a/.buildkite/scripts/run-prime-rl-test.sh +++ b/.buildkite/scripts/run-prime-rl-test.sh @@ -12,6 +12,11 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git" PRIME_RL_DIR="${REPO_ROOT}/prime-rl" +if command -v rocm-smi &> /dev/null || command -v rocminfo &> /dev/null; then + echo "AMD GPU detected. Prime-RL currently only supports NVIDIA. Skipping..." + exit 0 +fi + echo "Setting up Prime-RL integration test environment..." # Clean up any existing Prime-RL directory diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh deleted file mode 100644 index d7167161b0059..0000000000000 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env bash -set -euxo pipefail - -# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] -THRESHOLD=${1:-0.25} -NUM_Q=${2:-1319} -PORT=${3:-8030} -OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} -mkdir -p "${OUT_DIR}" - -wait_for_server() { - local port=$1 - timeout 600 bash -c ' - until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do - sleep 1 - done' -} - -MODEL="deepseek-ai/DeepSeek-V2-lite" - -# Set BACKENDS based on platform -if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then - # ROCm platform - BACKENDS=("allgather_reducescatter") - # Disable MOE padding for ROCm since it is causing eplb to fail - export VLLM_ROCM_MOE_PADDING=0 -else - # Non-ROCm platform (CUDA/other) - BACKENDS=("deepep_high_throughput" "deepep_low_latency") -fi - -cleanup() { - if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then - kill "${SERVER_PID}" 2>/dev/null || true - for _ in {1..20}; do - kill -0 "${SERVER_PID}" 2>/dev/null || break - sleep 0.5 - done - kill -9 "${SERVER_PID}" 2>/dev/null || true - fi -} -trap cleanup EXIT - -for BACK in "${BACKENDS[@]}"; do - VLLM_DEEP_GEMM_WARMUP=skip \ - VLLM_ALL2ALL_BACKEND=$BACK \ - vllm serve "$MODEL" \ - --enforce-eager \ - --tensor-parallel-size 2 \ - --data-parallel-size 2 \ - --enable-expert-parallel \ - --enable-eplb \ - --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ - --trust-remote-code \ - --max-model-len 2048 \ - --port $PORT & - SERVER_PID=$! 
- wait_for_server $PORT - - TAG=$(echo "$MODEL" | tr '/: \\n' '_____') - OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json" - python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} - python3 - <= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}" -PY - - cleanup - SERVER_PID= - sleep 1 - PORT=$((PORT+1)) -done diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh index 693418da6093e..8106f50f18f66 100644 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh @@ -50,7 +50,6 @@ for BACK in "${BACKENDS[@]}"; do --data-parallel-size 2 \ --enable-expert-parallel \ --enable-eplb \ - --eplb-config '{"window_size":200,"step_interval":600}' \ --trust-remote-code \ --max-model-len 2048 \ --port $PORT & diff --git a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh deleted file mode 100644 index 937a43d1a3221..0000000000000 --- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env bash -set -euxo pipefail - -# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] -THRESHOLD=${1:-0.25} -NUM_Q=${2:-1319} -PORT=${3:-8040} -OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} -mkdir -p "${OUT_DIR}" - -wait_for_server() { - local port=$1 - timeout 600 bash -c ' - until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do - sleep 1 - done' -} - -MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct" - -# Set BACKENDS based on platform -if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then - # ROCm platform - BACKENDS=("allgather_reducescatter") - # Disable MOE padding for ROCm since it is causing eplb to fail - export VLLM_ROCM_MOE_PADDING=0 -else - # Non-ROCm platform (CUDA/other) - BACKENDS=("deepep_high_throughput" "deepep_low_latency") -fi - -cleanup() { - if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then - kill "${SERVER_PID}" 2>/dev/null || true - for _ in {1..20}; do - kill -0 "${SERVER_PID}" 2>/dev/null || break - sleep 0.5 - done - kill -9 "${SERVER_PID}" 2>/dev/null || true - fi -} -trap cleanup EXIT - -for BACK in "${BACKENDS[@]}"; do - VLLM_DEEP_GEMM_WARMUP=skip \ - VLLM_ALL2ALL_BACKEND=$BACK \ - vllm serve "$MODEL" \ - --enforce-eager \ - --tensor-parallel-size 4 \ - --enable-expert-parallel \ - --enable-eplb \ - --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ - --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \ - --trust-remote-code \ - --max-model-len 2048 \ - --gpu-memory-utilization 0.9 \ - --port $PORT & - SERVER_PID=$! 
- wait_for_server $PORT - - TAG=$(echo "$MODEL" | tr '/: \\n' '_____') - OUT="${OUT_DIR}/${TAG}_${BACK}.json" - python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} - python3 - <= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}" -PY - - cleanup - SERVER_PID= - sleep 1 - PORT=$((PORT+1)) -done diff --git a/.buildkite/scripts/upload-wheels.sh b/.buildkite/scripts/upload-wheels.sh index 8e38ace0bfbc2..3a218a4bb2e6d 100644 --- a/.buildkite/scripts/upload-wheels.sh +++ b/.buildkite/scripts/upload-wheels.sh @@ -34,9 +34,10 @@ if [[ ${#wheel_files[@]} -ne 1 ]]; then fi wheel="${wheel_files[0]}" -# current build image uses ubuntu 20.04, which corresponds to manylinux_2_31 +# default build image uses ubuntu 20.04, which corresponds to manylinux_2_31 +# we also accept params as manylinux tag # refer to https://github.com/mayeut/pep600_compliance?tab=readme-ov-file#acceptable-distros-to-build-wheels -manylinux_version="manylinux_2_31" +manylinux_version="${1:-manylinux_2_31}" # Rename 'linux' to the appropriate manylinux version in the wheel filename if [[ "$wheel" != *"linux"* ]]; then @@ -96,8 +97,11 @@ if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]]; aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/nightly/" fi -# copy to // only if it does not have "dev" in the version +# re-generate and copy to // only if it does not have "dev" in the version if [[ "$version" != *"dev"* ]]; then - echo "Uploading indices to overwrite /$pure_version/" + echo "Re-generating indices for /$pure_version/" + rm -rf "$INDICES_OUTPUT_DIR/*" + mkdir -p "$INDICES_OUTPUT_DIR" + $PYTHON .buildkite/scripts/generate-nightly-index.py --version "$pure_version" --current-objects "$obj_json" --output-dir "$INDICES_OUTPUT_DIR" --comment "version $pure_version" $alias_arg aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/$pure_version/" fi diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 6950ad774edd8..4038d32834e68 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -398,7 +398,8 @@ steps: timeout_in_minutes: 25 gpu: h100 source_file_dependencies: - - vllm/ + - vllm/v1/attention + - vllm/model_executor/layers - tests/v1/determinism/ commands: - export VLLM_WORKER_MULTIPROC_METHOD=spawn @@ -440,23 +441,29 @@ steps: working_dir: "/vllm-workspace/examples" source_file_dependencies: - vllm/entrypoints + - vllm/multimodal - examples/ commands: - pip install tensorizer # for tensorizer test + # for basic + - python3 offline_inference/basic/chat.py - python3 offline_inference/basic/generate.py --model facebook/opt-125m - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/chat.py - - python3 offline_inference/prefix_caching.py - - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + # for multi-modal models - python3 offline_inference/audio_language.py --seed 0 - python3 offline_inference/vision_language.py --seed 0 - python3 offline_inference/vision_language_pooling.py --seed 0 - python3 offline_inference/vision_language_multi_image.py --seed 0 - - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors 
/tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py + # for pooling models + - python3 pooling/pooling/vision_language_pooling.py --seed 0 + # for features demo + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 @@ -718,6 +725,18 @@ steps: - uv pip install --system conch-triton-kernels - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py +- label: LM Eval Small Models # 53min + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + # grade: Blocking + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + autorun_on_main: true + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + - label: OpenAI API correctness # 10min timeout_in_minutes: 15 mirror_hardwares: [amdexperimental, amdproduction] @@ -727,7 +746,7 @@ steps: - csrc/ - vllm/entrypoints/openai/ - vllm/model_executor/models/whisper.py - commands: # LMEval + commands: # LMEval+Transcription WER check # Transcription WER check is skipped because encoder-decoder models are not supported on ROCm, see https://github.com/vllm-project/vllm/issues/27442 - pytest -s entrypoints/openai/correctness/ @@ -963,6 +982,19 @@ steps: - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work +- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min + timeout_in_minutes: 180 + mirror_hardwares: [amdexperimental, amdproduction] + agent_pool: mi325_1 + # grade: Blocking + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + - label: Multi-Modal Models Test (Extended) 1 # 60min timeout_in_minutes: 120 mirror_hardwares: [amdexperimental] @@ -1098,7 +1130,6 @@ steps: - vllm/model_executor/layers/layernorm.py - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - - vllm/model_executor/layers/fused_moe/layer.py - tests/compile/test_fusion_attn.py - tests/compile/test_silu_mul_quant_fusion.py - tests/compile/distributed/test_fusion_all_reduce.py @@ -1132,12 +1163,25 @@ steps: - vllm/model_executor/layers/activation.py - vllm/model_executor/layers/quantization/input_quant_fp8.py - tests/compile/distributed/test_fusions_e2e.py - - tests/compile/fullgraph/test_full_graph.py commands: - nvidia-smi # Run all e2e fusion tests - pytest -v -s tests/compile/distributed/test_fusions_e2e.py +- label: Blackwell GPT-OSS Eval + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + - label: Blackwell Quantized MoE Test timeout_in_minutes: 60 working_dir: "/vllm-workspace/" @@ -1155,6 +1199,16 @@ steps: commands: - pytest -s -v tests/quantization/test_blackwell_moe.py +- label: Blackwell LM Eval Small Models + timeout_in_minutes: 120 + gpu: b200 + optional: true # run on nightlies + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 + ##### 1 GPU test ##### ##### multi gpus test ##### @@ -1397,6 +1451,39 @@ steps: - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - pytest -v -s -x lora/test_mixtral.py + +- label: LM Eval Large Models # optional + gpu: a100 + optional: true + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +##### H100 test ##### +- label: LM Eval Large Models (H100) # optional + gpu: h100 + optional: true + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 + - 
pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 + + ##### H200 test ##### - label: Distributed Tests (H200) # optional mirror_hardwares: [amdexperimental] @@ -1440,29 +1527,6 @@ steps: commands: - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 -- label: Blackwell LM Eval Small Models - timeout_in_minutes: 120 - gpu: b200 - optional: true # run on nightlies - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 - -- label: Multi-Modal Accuracy Eval (Small Models) # 10min - timeout_in_minutes: 70 - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_1 - # grade: Blocking - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - vllm/multimodal/ - - vllm/inputs/ - - vllm/v1/core/ - commands: - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 - - label: LM Eval Large Models (4 Card) mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_4 @@ -1478,21 +1542,6 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 -- label: LM Eval Large Models (H100) # optional - mirror_hardwares: [amdexperimental, amdproduction] - agent_pool: mi325_4 - # grade: Blocking - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 - - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 - - label: ROCm LM Eval Large Models (8 Card) mirror_hardwares: [amdproduction] agent_pool: mi325_8 @@ -1517,6 +1566,20 @@ steps: - uv pip install --system 'gpt-oss[eval]==0.0.5' - VLLM_ROCM_USE_AITER_MHA=0 VLLM_ROCM_USE_AITER=1 VLLM_USE_AITER_UNIFIED_ATTENTION=1 pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 +##### RL Integration Tests ##### +- label: Prime-RL Integration Test # 15min + mirror_hardwares: [amdexperimental] + agent_pool: mi325_2 + # grade: Blocking + timeout_in_minutes: 30 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/ + - .buildkite/scripts/run-prime-rl-test.sh + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh - label: DeepSeek V2-Lite Accuracy mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_4 @@ -1550,17 +1613,26 @@ steps: commands: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 -##### RL Integration Tests ##### -- label: Prime-RL Integration Test # 15min +- label: DeepSeek V2-Lite Async EPLB Accuracy + timeout_in_minutes: 60 mirror_hardwares: [amdexperimental] - agent_pool: mi325_2 + agent_pool: mi325_4 # grade: Blocking - timeout_in_minutes: 30 + gpu: h100 optional: true - num_gpus: 2 + num_gpus: 4 working_dir: "/vllm-workspace" - source_file_dependencies: - - vllm/ - - .buildkite/scripts/run-prime-rl-test.sh commands: - - bash .buildkite/scripts/run-prime-rl-test.sh + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030 + +- 
label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy + timeout_in_minutes: 60 + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + # grade: Blocking + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 0a99994e243ae..750e7c038351c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -468,7 +468,9 @@ steps: # tests covered elsewhere. # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\\\;" + # However, find does not normally propagate error codes, so we combine it with xargs + # (using -0 for proper path handling) + - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - label: PyTorch Fullgraph Smoke Test # 15min timeout_in_minutes: 30 @@ -482,7 +484,9 @@ steps: # as it is a heavy test that is covered in other steps. # Use `find` to launch multiple instances of pytest so that # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 - - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;" + # However, find does not normally propagate error codes, so we combine it with xargs + # (using -0 for proper path handling) + - "find compile/fullgraph -maxdepth 1 -name 'test_*.py' -not -name 'test_full_graph.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'" - label: PyTorch Fullgraph Test # 27min timeout_in_minutes: 40 @@ -1375,22 +1379,4 @@ steps: num_gpus: 2 working_dir: "/vllm-workspace" commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - -- label: DeepSeek V2-Lite Async EPLB Accuracy - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030 - -- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 \ No newline at end of file diff --git a/.buildkite/test_areas/attention.yaml b/.buildkite/test_areas/attention.yaml new file mode 100644 index 0000000000000..6e444eae14c74 --- /dev/null +++ b/.buildkite/test_areas/attention.yaml @@ -0,0 +1,21 @@ +group: Attention +depends_on: + - image-build +steps: +- label: V1 attention (H100) + timeout_in_minutes: 30 + gpu: h100 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - pytest -v -s v1/attention + +- label: V1 attention (B200) + timeout_in_minutes: 30 + gpu: b200 + source_file_dependencies: + - vllm/v1/attention + - tests/v1/attention + commands: + - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this diff --git a/.buildkite/test_areas/basic_correctness.yaml b/.buildkite/test_areas/basic_correctness.yaml new file mode 100644 index 0000000000000..759d2b5358714 --- /dev/null +++ 
b/.buildkite/test_areas/basic_correctness.yaml @@ -0,0 +1,16 @@ +group: Basic Correctness +depends_on: + - image-build +steps: +- label: Basic Correctness + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_basic_correctness + - tests/basic_correctness/test_cpu_offload + - tests/basic_correctness/test_cumem.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py diff --git a/.buildkite/test_areas/benchmarks.yaml b/.buildkite/test_areas/benchmarks.yaml new file mode 100644 index 0000000000000..574b642d407b0 --- /dev/null +++ b/.buildkite/test_areas/benchmarks.yaml @@ -0,0 +1,19 @@ +group: Benchmarks +depends_on: + - image-build +steps: +- label: Benchmarks + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/.buildkite" + source_file_dependencies: + - benchmarks/ + commands: + - bash scripts/run-benchmarks.sh + +- label: Benchmarks CLI Test + timeout_in_minutes: 20 + source_file_dependencies: + - vllm/ + - tests/benchmarks/ + commands: + - pytest -v -s benchmarks/ diff --git a/.buildkite/test_areas/compile.yaml b/.buildkite/test_areas/compile.yaml new file mode 100644 index 0000000000000..0ba00925a4838 --- /dev/null +++ b/.buildkite/test_areas/compile.yaml @@ -0,0 +1,57 @@ +group: Compile +depends_on: + - image-build +steps: +- label: Fusion and Compile Tests (B200) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/worker/ + - vllm/v1/cudagraph_dispatcher.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/test_fusion_attn.py + - tests/compile/test_silu_mul_quant_fusion.py + - tests/compile/distributed/test_fusion_all_reduce.py + - tests/compile/distributed/test_fusions_e2e.py + - tests/compile/fullgraph/test_full_graph.py + commands: + - nvidia-smi + - pytest -v -s tests/compile/test_fusion_attn.py + - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + # this runner has 2 GPUs available even though num_gpus=2 is not set + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time + # Wrap with quotes to escape yaml + - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'" + # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40) + - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile + +- label: Fusion E2E (2 GPUs)(B200) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true + num_gpus: 2 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + - tests/compile/distributed/test_fusions_e2e.py + 
commands: + - nvidia-smi + # Run all e2e fusion tests + - pytest -v -s tests/compile/distributed/test_fusions_e2e.py + diff --git a/.buildkite/test_areas/cuda.yaml b/.buildkite/test_areas/cuda.yaml new file mode 100644 index 0000000000000..50c0c338c2434 --- /dev/null +++ b/.buildkite/test_areas/cuda.yaml @@ -0,0 +1,22 @@ +group: CUDA +depends_on: + - image-build +steps: +- label: Platform Tests (CUDA) + timeout_in_minutes: 15 + source_file_dependencies: + - vllm/ + - tests/cuda + commands: + - pytest -v -s cuda/test_cuda_context.py + +- label: Cudagraph + timeout_in_minutes: 20 + source_file_dependencies: + - tests/v1/cudagraph + - vllm/v1/cudagraph_dispatcher.py + - vllm/config/compilation.py + - vllm/compilation + commands: + - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py + - pytest -v -s v1/cudagraph/test_cudagraph_mode.py \ No newline at end of file diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml new file mode 100644 index 0000000000000..2cc90698d916a --- /dev/null +++ b/.buildkite/test_areas/distributed.yaml @@ -0,0 +1,199 @@ +group: Distributed +depends_on: + - image-build +steps: +- label: Distributed Comm Ops + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/distributed + - tests/distributed + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py + - pytest -v -s distributed/test_shm_buffer.py + - pytest -v -s distributed/test_shm_storage.py + +- label: Distributed (2 GPUs) + timeout_in_minutes: 90 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/compilation/ + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/worker/worker_base.py + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/compile/fullgraph/test_basic_correctness.py + - tests/compile/test_wrapper.py + - tests/distributed/ + - tests/entrypoints/llm/test_collective_rpc.py + - tests/v1/distributed + - tests/v1/entrypoints/openai/test_multi_api_servers.py + - tests/v1/shutdown + - tests/v1/worker/test_worker_memory_snapshot.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - pytest -v -s ./compile/fullgraph/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - pytest -v -s distributed/test_sequence_parallel.py + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown + - pytest -v -s v1/worker/test_worker_memory_snapshot.py + +- label: Distributed Tests (4 GPUs) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - tests/distributed/test_utils + - tests/distributed/test_pynccl + - tests/distributed/test_events + - tests/compile/fullgraph/test_basic_correctness.py + - examples/offline_inference/rlhf.py + - examples/offline_inference/rlhf_colocate.py + - 
tests/examples/offline_inference/data_parallel.py + - tests/v1/distributed + - tests/v1/engine/test_engine_core_client.py + - tests/distributed/test_symm_mem_allreduce.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + # test with torchrun tp=2 and external_dp=2 + - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=2 and pp=2 + - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py + # test with torchrun tp=4 and dp=1 + - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2, pp=2 and dp=1 + - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=1 and dp=4 with ep + - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with torchrun tp=2 and dp=2 with ep + - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py + # test with internal dp + - python3 ../examples/offline_inference/data_parallel.py --enforce-eager + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py + - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py + - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/fullgraph/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s distributed/test_events.py + - pytest -v -s distributed/test_symm_mem_allreduce.py + # TODO: create a dedicated test section for multi-GPU example tests + # when we have multiple distributed example tests + - cd ../examples/offline_inference + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py + - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + +- label: Distributed Tests (8 GPUs)(H100) + timeout_in_minutes: 10 + gpu: h100 + num_gpus: 8 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - examples/offline_inference/torchrun_dp_example.py + - vllm/config/parallel.py + - vllm/distributed/ + - vllm/v1/engine/llm_engine.py + - vllm/v1/executor/uniproc_executor.py + - vllm/v1/worker/gpu_worker.py + commands: + # https://github.com/NVIDIA/nccl/issues/1838 + - export NCCL_CUMEM_HOST_ENABLE=0 + # test with torchrun tp=2 and dp=4 with ep + - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep + +- label: Distributed Tests (4 GPUs)(A100) + gpu: a100 + optional: true + num_gpus: 4 + source_file_dependencies: + - vllm/ + commands: + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - pytest -v -s -x lora/test_mixtral.py + +- label: Distributed Tests (2 GPUs)(H200) + gpu: h200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py + - pytest -v -s 
tests/compile/distributed/test_sequence_parallelism.py + - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4' + - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py + - pytest -v -s tests/distributed/test_context_parallel.py + - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048 + - pytest -v -s tests/v1/distributed/test_dbo.py + +- label: Distributed Tests (2 GPUs)(B200) + gpu: b200 + optional: true + working_dir: "/vllm-workspace/" + num_gpus: 2 + commands: + - pytest -v -s tests/distributed/test_context_parallel.py + - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py + - pytest -v -s tests/v1/distributed/test_dbo.py + +- label: 2 Node Test (4 GPUs) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + num_nodes: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + - tests/examples/offline_inference/data_parallel.py + commands: + - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code" + +- label: Distributed NixlConnector PD accuracy (4 GPUs) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh + +- label: Pipeline + Context Parallelism (4 GPUs)) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py \ No newline at end 
of file diff --git a/.buildkite/test_areas/e2e_integration.yaml b/.buildkite/test_areas/e2e_integration.yaml new file mode 100644 index 0000000000000..93d389815edac --- /dev/null +++ b/.buildkite/test_areas/e2e_integration.yaml @@ -0,0 +1,59 @@ +group: E2E Integration +depends_on: + - image-build +steps: +- label: DeepSeek V2-Lite Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010 + +- label: Qwen3-30B-A3B-FP8-block Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 + +- label: Qwen3-30B-A3B-FP8-block Accuracy (B200) + timeout_in_minutes: 60 + gpu: b200 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 + +- label: Prime-RL Integration (2 GPUs) + timeout_in_minutes: 30 + optional: true + num_gpus: 2 + working_dir: "/vllm-workspace" + source_file_dependencies: + - vllm/ + - .buildkite/scripts/run-prime-rl-test.sh + commands: + - bash .buildkite/scripts/run-prime-rl-test.sh + +- label: DeepSeek V2-Lite Async EPLB Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030 + +- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy + timeout_in_minutes: 60 + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace" + commands: + - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 diff --git a/.buildkite/test_areas/engine.yaml b/.buildkite/test_areas/engine.yaml new file mode 100644 index 0000000000000..a028e0e4af4c1 --- /dev/null +++ b/.buildkite/test_areas/engine.yaml @@ -0,0 +1,26 @@ +group: Engine +depends_on: + - image-build +steps: +- label: Engine + timeout_in_minutes: 15 + source_file_dependencies: + - vllm/ + - tests/engine + - tests/test_sequence + - tests/test_config + - tests/test_logger + - tests/test_vllm_port + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py + +- label: V1 e2e + engine + timeout_in_minutes: 45 + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. 
+ - pytest -v -s v1/e2e + - pytest -v -s v1/engine diff --git a/.buildkite/test_areas/entrypoints.yaml b/.buildkite/test_areas/entrypoints.yaml new file mode 100644 index 0000000000000..0a789be943f37 --- /dev/null +++ b/.buildkite/test_areas/entrypoints.yaml @@ -0,0 +1,68 @@ +group: Entrypoints +depends_on: + - image-build +steps: +- label: Entrypoints Unit Tests + timeout_in_minutes: 10 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/entrypoints + - tests/entrypoints/ + commands: + - pytest -v -s entrypoints/openai/tool_parsers + - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling + +- label: Entrypoints Integration (LLM) + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/llm + - tests/entrypoints/offline_mode + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process + - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests + +- label: Entrypoints Integration (API Server) + timeout_in_minutes: 130 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/ + - pytest -v -s entrypoints/test_chat_utils.py + + +- label: Entrypoints Integration (Pooling) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/ + - tests/entrypoints/pooling + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/pooling + + +- label: Entrypoints V1 + timeout_in_minutes: 50 + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1/entrypoints + +- label: OpenAI API Correctness + timeout_in_minutes: 30 + source_file_dependencies: + - csrc/ + - vllm/entrypoints/openai/ + - vllm/model_executor/models/whisper.py + commands: # LMEval+Transcription WER check + - pytest -s entrypoints/openai/correctness/ diff --git a/.buildkite/test_areas/expert_parallelism.yaml b/.buildkite/test_areas/expert_parallelism.yaml new file mode 100644 index 0000000000000..feb8252148c7f --- /dev/null +++ b/.buildkite/test_areas/expert_parallelism.yaml @@ -0,0 +1,23 @@ +group: Expert Parallelism +depends_on: + - image-build +steps: +- label: EPLB Algorithm + timeout_in_minutes: 15 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_algo.py + commands: + - pytest -v -s distributed/test_eplb_algo.py + +- label: EPLB Execution + timeout_in_minutes: 20 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/eplb + - tests/distributed/test_eplb_execute.py + 
commands: + - pytest -v -s distributed/test_eplb_execute.py + - pytest -v -s distributed/test_eplb_spec_decode.py \ No newline at end of file diff --git a/.buildkite/test_areas/kernels.yaml b/.buildkite/test_areas/kernels.yaml new file mode 100644 index 0000000000000..7ca099516d641 --- /dev/null +++ b/.buildkite/test_areas/kernels.yaml @@ -0,0 +1,117 @@ +group: Kernels +depends_on: + - image-build +steps: +- label: Kernels Core Operation Test + timeout_in_minutes: 75 + source_file_dependencies: + - csrc/ + - tests/kernels/core + - tests/kernels/test_top_k_per_row.py + commands: + - pytest -v -s kernels/core kernels/test_top_k_per_row.py + +- label: Kernels Attention Test %N + timeout_in_minutes: 35 + source_file_dependencies: + - csrc/attention/ + - vllm/attention + - vllm/v1/attention + - tests/kernels/attention + commands: + - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Quantization Test %N + timeout_in_minutes: 90 + source_file_dependencies: + - csrc/quantization/ + - vllm/model_executor/layers/quantization + - tests/kernels/quantization + commands: + - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels MoE Test %N + timeout_in_minutes: 60 + source_file_dependencies: + - csrc/quantization/cutlass_w8a8/moe/ + - csrc/moe/ + - tests/kernels/moe + - vllm/model_executor/layers/fused_moe/ + - vllm/distributed/device_communicators/ + - vllm/envs.py + - vllm/config + commands: + - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 2 + +- label: Kernels Mamba Test + timeout_in_minutes: 45 + source_file_dependencies: + - csrc/mamba/ + - tests/kernels/mamba + - vllm/model_executor/layers/mamba/ops + commands: + - pytest -v -s kernels/mamba + +- label: Kernels DeepGEMM Test (H100) + timeout_in_minutes: 45 + gpu: h100 + num_gpus: 1 + source_file_dependencies: + - tools/install_deepgemm.sh + - vllm/utils/deep_gemm.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization + - tests/kernels/quantization/test_block_fp8.py + - tests/kernels/moe/test_deepgemm.py + - tests/kernels/moe/test_batched_deepgemm.py + - tests/kernels/attention/test_deepgemm_attention.py + commands: + - pytest -v -s kernels/quantization/test_block_fp8.py -k deep_gemm + - pytest -v -s kernels/moe/test_deepgemm.py + - pytest -v -s kernels/moe/test_batched_deepgemm.py + - pytest -v -s kernels/attention/test_deepgemm_attention.py + +- label: Kernels (B200) + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/" + gpu: b200 + # optional: true + source_file_dependencies: + - csrc/quantization/fp4/ + - csrc/attention/mla/ + - csrc/quantization/cutlass_w8a8/moe/ + - vllm/model_executor/layers/fused_moe/cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py + - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/v1/attention/backends/mla/cutlass_mla.py + - vllm/v1/attention/backends/mla/flashinfer_mla.py + - vllm/platforms/cuda.py + - vllm/attention/selector.py + commands: + - nvidia-smi + - python3 examples/offline_inference/basic/chat.py + # Attention + # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353 + - pytest -v -s 
tests/kernels/attention/test_attention_selector.py + - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2' + - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py + - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py + - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py + # Quantization + - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8' + - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py + - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py + - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py + - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py + - pytest -v -s tests/kernels/moe/test_flashinfer.py + - pytest -v -s tests/kernels/moe/test_cutedsl_moe.py \ No newline at end of file diff --git a/.buildkite/test_areas/lm_eval.yaml b/.buildkite/test_areas/lm_eval.yaml new file mode 100644 index 0000000000000..9af43e0c375a8 --- /dev/null +++ b/.buildkite/test_areas/lm_eval.yaml @@ -0,0 +1,46 @@ +group: LM Eval +depends_on: + - image-build +steps: +- label: LM Eval Small Models + timeout_in_minutes: 75 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + autorun_on_main: true + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1 + +- label: LM Eval Large Models (4 GPUs)(A100) + gpu: a100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 + +- label: LM Eval Large Models (4 GPUs)(H100) + gpu: h100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100 + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4 + +- label: LM Eval Small Models (B200) + timeout_in_minutes: 120 + gpu: b200 + optional: true + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 diff --git a/.buildkite/test_areas/lora.yaml b/.buildkite/test_areas/lora.yaml new file mode 100644 index 0000000000000..809b4138f44ba --- /dev/null +++ b/.buildkite/test_areas/lora.yaml @@ -0,0 +1,31 @@ +group: LoRA +depends_on: + - image-build +steps: +- label: LoRA %N + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py 
--ignore=lora/test_qwen3moe_tp.py + parallelism: 4 + + +- label: LoRA TP (Distributed) + timeout_in_minutes: 30 + num_gpus: 4 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # There is some Tensor Parallelism related processing logic in LoRA that + # requires multi-GPU testing for validation. + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_llm_with_multi_loras.py + - pytest -v -s -x lora/test_olmoe_tp.py + - pytest -v -s -x lora/test_gptoss_tp.py \ No newline at end of file diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml new file mode 100644 index 0000000000000..072bccadb726a --- /dev/null +++ b/.buildkite/test_areas/misc.yaml @@ -0,0 +1,163 @@ +group: Miscellaneous +depends_on: + - image-build +steps: +- label: V1 Others + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + # split the test to avoid interference + - pytest -v -s -m 'not cpu_test' v1/core + - pytest -v -s v1/executor + - pytest -v -s v1/kv_offload + - pytest -v -s v1/sample + - pytest -v -s v1/logits_processors + - pytest -v -s v1/worker + - pytest -v -s v1/spec_decode + - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'not cpu_test' v1/metrics + - pytest -v -s v1/test_oracle.py + - pytest -v -s v1/test_request.py + - pytest -v -s v1/test_outputs.py + # Integration test for streaming correctness (requires special branch). + - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +- label: V1 Others (CPU) + depends_on: ~ + source_file_dependencies: + - vllm/ + - tests/v1 + no_gpu: true + commands: + # split the test to avoid interference + - pytest -v -s -m 'cpu_test' v1/core + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_serial_utils.py + - pytest -v -s -m 'cpu_test' v1/kv_connector/unit + - pytest -v -s -m 'cpu_test' v1/metrics + +- label: Regression + timeout_in_minutes: 20 + source_file_dependencies: + - vllm/ + - tests/test_regression + commands: + - pip install modelscope + - pytest -v -s test_regression.py + working_dir: "/vllm-workspace/tests" # optional + +- label: Examples + timeout_in_minutes: 45 + working_dir: "/vllm-workspace/examples" + source_file_dependencies: + - vllm/entrypoints + - vllm/multimodal + - examples/ + commands: + - pip install tensorizer # for tensorizer test + - python3 offline_inference/basic/chat.py # for basic + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + # for multi-modal models + - python3 offline_inference/audio_language.py --seed 0 + - python3 offline_inference/vision_language.py --seed 0 + - python3 offline_inference/vision_language_multi_image.py --seed 0 + - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 + # for pooling models + - python3 pooling/pooling/vision_language_pooling.py --seed 0 + # for 
features demo + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048 + # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU + - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536 + +- label: Metrics, Tracing (2 GPUs) + timeout_in_minutes: 20 + num_gpus: 2 + source_file_dependencies: + - vllm/ + - tests/v1/tracing + commands: + - "pip install \ + 'opentelemetry-sdk>=1.26.0' \ + 'opentelemetry-api>=1.26.0' \ + 'opentelemetry-exporter-otlp>=1.26.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1'" + - pytest -v -s v1/tracing + +- label: Python-only Installation + depends_on: ~ + timeout_in_minutes: 20 + source_file_dependencies: + - tests/standalone_tests/python_only_compile.sh + - setup.py + commands: + - bash standalone_tests/python_only_compile.sh + +- label: Async Engine, Inputs, Utils, Worker + timeout_in_minutes: 50 + source_file_dependencies: + - vllm/ + - tests/multimodal + - tests/utils_ + commands: + - pytest -v -s -m 'not cpu_test' multimodal + - pytest -v -s utils_ + +- label: Async Engine, Inputs, Utils, Worker, Config (CPU) + depends_on: ~ + timeout_in_minutes: 20 + source_file_dependencies: + - vllm/ + - tests/test_inputs.py + - tests/test_outputs.py + - tests/multimodal + - tests/standalone_tests/lazy_imports.py + - tests/tokenizers_ + - tests/transformers_utils + - tests/config + no_gpu: true + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s test_inputs.py + - pytest -v -s test_outputs.py + - pytest -v -s -m 'cpu_test' multimodal + - pytest -v -s tokenizers_ + - pytest -v -s transformers_utils + - pytest -v -s config + +- label: GPT-OSS Eval (B200) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + optional: true + source_file_dependencies: + - tests/evals/gpt_oss + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - uv pip install --system 'gpt-oss[eval]==0.0.5' + - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 + +- label: Batch Invariance (H100) + timeout_in_minutes: 25 + gpu: h100 + source_file_dependencies: + - vllm/v1/attention + - vllm/model_executor/layers + - tests/v1/determinism/ + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pip install pytest-timeout pytest-forked + - pytest -v -s v1/determinism/test_batch_invariance.py + - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py \ No newline at end of file diff --git a/.buildkite/test_areas/model_executor.yaml b/.buildkite/test_areas/model_executor.yaml new file mode 100644 index 0000000000000..996c8bb8b780a --- /dev/null +++ b/.buildkite/test_areas/model_executor.yaml @@ -0,0 +1,17 @@ +group: 
Model Executor +depends_on: + - image-build +steps: +- label: Model Executor + timeout_in_minutes: 35 + source_file_dependencies: + - vllm/engine/arg_utils.py + - vllm/config/model.py + - vllm/model_executor + - tests/model_executor + - tests/entrypoints/openai/test_tensorizer_entrypoint.py + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s model_executor + - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml new file mode 100644 index 0000000000000..39a5d51c48833 --- /dev/null +++ b/.buildkite/test_areas/models_basic.yaml @@ -0,0 +1,62 @@ +group: Models - Basic +depends_on: + - image-build +steps: +- label: Basic Models Tests (Initialization) + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/test_initialization.py + commands: + # Run a subset of model initialization tests + - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset + +- label: Basic Models Tests (Extra Initialization) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/test_initialization.py + commands: + # Only when vLLM model source is modified - test initialization of a large + # subset of supported models (the complement of the small subset in the above + # test.) Also run if model initialization test file is modified + - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Basic Models Tests (Other) + timeout_in_minutes: 45 + source_file_dependencies: + - vllm/ + - tests/models/test_transformers.py + - tests/models/test_registry.py + commands: + - pytest -v -s models/test_transformers.py models/test_registry.py + +- label: Basic Models Test (Other CPU) # 5min + timeout_in_minutes: 10 + source_file_dependencies: + - vllm/ + - tests/models/test_utils.py + - tests/models/test_vision.py + no_gpu: true + commands: + - pytest -v -s models/test_utils.py models/test_vision.py + +- label: Transformers Nightly Models + working_dir: "/vllm-workspace/" + optional: true + soft_fail: true + commands: + - pip install --upgrade git+https://github.com/huggingface/transformers + - pytest -v -s tests/models/test_initialization.py + - pytest -v -s tests/models/test_transformers.py + - pytest -v -s tests/models/multimodal/processing/ + - pytest -v -s tests/models/multimodal/test_mapping.py + - python3 examples/offline_inference/basic/chat.py + - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl + # Whisper needs spawn method to avoid deadlock + - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper diff --git a/.buildkite/test_areas/models_distributed.yaml b/.buildkite/test_areas/models_distributed.yaml new file mode 100644 index 0000000000000..b6bfbf2ddab47 --- /dev/null +++ b/.buildkite/test_areas/models_distributed.yaml @@ -0,0 +1,22 @@ +group: Models - Distributed +depends_on: + - image-build +steps: +- label: Distributed Model Tests (2 GPUs) + timeout_in_minutes: 50 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/model_executor/model_loader/sharded_state_loader.py + - 
vllm/model_executor/models/ + - tests/basic_correctness/ + - tests/model_executor/model_loader/test_sharded_state_loader.py + - tests/models/ + commands: + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py + # Avoid importing model tests that cause CUDA reinitialization error + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/language -v -s -m 'distributed(num_gpus=2)' + - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py + - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)' diff --git a/.buildkite/test_areas/models_language.yaml b/.buildkite/test_areas/models_language.yaml new file mode 100644 index 0000000000000..f70192c4ebc0a --- /dev/null +++ b/.buildkite/test_areas/models_language.yaml @@ -0,0 +1,91 @@ +group: Models - Language +depends_on: + - image-build +steps: +- label: Language Models Tests (Standard) + timeout_in_minutes: 25 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language + commands: + # Test standard language models, excluding a subset of slow tests + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and (not slow_test)' + +- label: Language Models Tests (Extra Standard) %N + timeout_in_minutes: 45 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/model_executor/models/ + - tests/models/language/pooling/test_embedding.py + - tests/models/language/generation/test_common.py + - tests/models/language/pooling/test_classification.py + commands: + # Shard slow subset of standard language models tests. 
Only run when model + # source is modified, or when specified test files are modified + - pip freeze | grep -E 'torch' + - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Tests (Hybrid) %N + timeout_in_minutes: 75 + mirror_hardwares: [amdexperimental] + torch_nightly: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + # Shard hybrid language model tests + - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB + parallelism: 2 + +- label: Language Models Test (Extended Generation) # 80min + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation + commands: + # Install fast path packages for testing against transformers + # Note: also needed to run plamo2 model in vLLM + - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' + - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' + +- label: Language Models Test (PPL) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/generation_ppl_test + commands: + - pytest -v -s models/language/generation_ppl_test + +- label: Language Models Test (Extended Pooling) # 36min + timeout_in_minutes: 50 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling + commands: + - pytest -v -s models/language/pooling -m 'not core_model' + +- label: Language Models Test (MTEB) + timeout_in_minutes: 110 + mirror_hardwares: [amdexperimental] + optional: true + source_file_dependencies: + - vllm/ + - tests/models/language/pooling_mteb_test + commands: + - pytest -v -s models/language/pooling_mteb_test diff --git a/.buildkite/test_areas/models_multimodal.yaml b/.buildkite/test_areas/models_multimodal.yaml new file mode 100644 index 0000000000000..fc24068c20a46 --- /dev/null +++ b/.buildkite/test_areas/models_multimodal.yaml @@ -0,0 +1,79 @@ +group: Models - Multimodal +depends_on: + - image-build +steps: +- label: Multi-Modal Models (Standard) # 60min + timeout_in_minutes: 80 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pip freeze | grep -E 'torch' + - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing + - cd .. 
&& VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work + +- label: Multi-Modal Processor Test (CPU) + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + no_gpu: true + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py + +- label: Multi-Modal Processor # 44min + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/processing/test_tensor_schema.py + +- label: Multi-Modal Accuracy Eval (Small Models) # 50min + timeout_in_minutes: 70 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + +- label: Multi-Modal Models (Extended) 1 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing + +- label: Multi-Modal Models (Extended) 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' + +- label: Multi-Modal Models (Extended) 3 + optional: true + source_file_dependencies: + - vllm/ + - tests/models/multimodal + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' + +# This test is used only in PR development phase to test individual models and should never run on main +- label: Custom Models + optional: true + commands: + - echo 'Testing custom models...' + # PR authors can temporarily add commands below to test individual models + # e.g. 
pytest -v -s models/encoder_decoder/vision_language/test_mllama.py + # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* diff --git a/.buildkite/test_areas/plugins.yaml b/.buildkite/test_areas/plugins.yaml new file mode 100644 index 0000000000000..60c179aa098e1 --- /dev/null +++ b/.buildkite/test_areas/plugins.yaml @@ -0,0 +1,34 @@ +group: Plugins +depends_on: + - image-build +steps: +- label: Plugin Tests (2 GPUs) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/plugins/ + - tests/plugins/ + commands: + # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + # end platform plugin tests + # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin + - pip install -e ./plugins/prithvi_io_processor_plugin + - pytest -v -s plugins_tests/test_io_processor_plugins.py + - pip uninstall prithvi_io_processor_plugin -y + # end io_processor plugins test + # begin stat_logger plugins test + - pip install -e ./plugins/vllm_add_dummy_stat_logger + - pytest -v -s plugins_tests/test_stats_logger_plugins.py + - pip uninstall dummy_stat_logger -y + # end stat_logger plugins test + # other tests continue here: + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process + - pytest -v -s models/test_oot_registration.py # it needs a clean process + - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins diff --git a/.buildkite/test_areas/pytorch.yaml b/.buildkite/test_areas/pytorch.yaml new file mode 100644 index 0000000000000..703c82eb1a91b --- /dev/null +++ b/.buildkite/test_areas/pytorch.yaml @@ -0,0 +1,50 @@ +group: PyTorch +depends_on: + - image-build +steps: +- label: PyTorch Compilation Unit Tests + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/ + - tests/compile + commands: + # Run unit tests defined directly under compile/, + # not including subdirectories, which are usually heavier + # tests covered elsewhere. + # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/ -maxdepth 1 -name 'test_*.py' -exec pytest -s -v {} \\;" + +- label: PyTorch Fullgraph Smoke Test + timeout_in_minutes: 30 + source_file_dependencies: + - vllm/ + - tests/compile + commands: + # Run smoke tests under fullgraph directory, except test_full_graph.py + # as it is a heavy test that is covered in other steps. 
+ # Use `find` to launch multiple instances of pytest so that + # they do not suffer from https://github.com/vllm-project/vllm/issues/28965 + - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;" + +- label: PyTorch Fullgraph + timeout_in_minutes: 40 + source_file_dependencies: + - vllm/ + - tests/compile + commands: + # fp8 kv scales not supported on sm89, tested on Blackwell instead + - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile' + # Limit to no custom ops to reduce running time + # Wrap with quotes to escape yaml and avoid starting -k string with a - + - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'" + +- label: Pytorch Nightly Dependency Override Check # 2min + # if this test fails, it means the nightly torch version is not compatible with some + # of the dependencies. Please check the error message and add the package to whitelist + # in /vllm/tools/pre_commit/generate_nightly_torch_test.py + soft_fail: true + source_file_dependencies: + - requirements/nightly_torch_test.txt + commands: + - bash standalone_tests/pytorch_nightly_dependency.sh \ No newline at end of file diff --git a/.buildkite/test_areas/quantization.yaml b/.buildkite/test_areas/quantization.yaml new file mode 100644 index 0000000000000..6e89d6af3b8d1 --- /dev/null +++ b/.buildkite/test_areas/quantization.yaml @@ -0,0 +1,46 @@ +group: Quantization +depends_on: + - image-build +steps: +- label: Quantization + timeout_in_minutes: 90 + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/quantization + commands: + # temporary install here since we need nightly, will move to requirements/test.in + # after torchao 0.12 release, and pin a working version of torchao nightly here + + # since torchao nightly is only compatible with torch nightly currently + # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now + # we can only upgrade after this is resolved + # TODO(jerryzh168): resolve the above comment + - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129 + - uv pip install --system conch-triton-kernels + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py + +- label: Quantized MoE Test (B200) + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - tests/quantization/test_blackwell_moe.py + - vllm/model_executor/models/deepseek_v2.py + - vllm/model_executor/models/gpt_oss.py + - vllm/model_executor/models/llama4.py + - vllm/model_executor/layers/fused_moe + - vllm/model_executor/layers/quantization/compressed_tensors + - vllm/model_executor/layers/quantization/modelopt.py + - vllm/model_executor/layers/quantization/mxfp4.py + - vllm/v1/attention/backends/flashinfer.py + commands: + - pytest -s -v tests/quantization/test_blackwell_moe.py + +- label: Quantized Models Test + timeout_in_minutes: 60 + source_file_dependencies: + - vllm/model_executor/layers/quantization + - tests/models/quantization + commands: + - pytest -v -s models/quantization diff --git a/.buildkite/test_areas/samplers.yaml b/.buildkite/test_areas/samplers.yaml new file mode 100644 index 0000000000000..ad377148fd073 --- /dev/null +++ b/.buildkite/test_areas/samplers.yaml @@ -0,0 +1,14 @@ +group: Samplers +depends_on: + - image-build +steps: +- label: Samplers Test + timeout_in_minutes: 75 + 
source_file_dependencies: + - vllm/model_executor/layers + - vllm/sampling_metadata.py + - tests/samplers + - tests/conftest.py + commands: + - pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers diff --git a/.buildkite/test_areas/tool_use.yaml b/.buildkite/test_areas/tool_use.yaml new file mode 100644 index 0000000000000..7040cd1d253b3 --- /dev/null +++ b/.buildkite/test_areas/tool_use.yaml @@ -0,0 +1,23 @@ +group: Tool use +depends_on: + - image-build +steps: +- label: OpenAI-Compatible Tool Use + timeout_in_minutes: 35 + mirror_hardwares: [amdexperimental] + fast_check: false + source_file_dependencies: + - vllm/ + - tests/tool_use + commands: + - pytest -v -s -m 'not cpu_test' tool_use + +- label: OpenAI-Compatible Tool Use (CPU) + depends_on: ~ + timeout_in_minutes: 10 + source_file_dependencies: + - vllm/ + - tests/tool_use + no_gpu: true + commands: + - pytest -v -s -m 'cpu_test' tool_use diff --git a/.buildkite/test_areas/weight_loading.yaml b/.buildkite/test_areas/weight_loading.yaml new file mode 100644 index 0000000000000..cfc5bb20fe7ad --- /dev/null +++ b/.buildkite/test_areas/weight_loading.yaml @@ -0,0 +1,25 @@ +group: Weight Loading +depends_on: + - image-build +steps: +- label: Weight Loading Multiple GPU # 33min + timeout_in_minutes: 45 + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt + +- label: Weight Loading Multiple GPU - Large Models # optional + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + gpu: a100 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index 56fbe5ca704a1..df8910837715d 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -13,7 +13,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - name: Set up Python uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 diff --git a/.github/workflows/macos-smoke-test.yml b/.github/workflows/macos-smoke-test.yml index 3a12c4b3a8300..e80a5c0cc80f9 100644 --- a/.github/workflows/macos-smoke-test.yml +++ b/.github/workflows/macos-smoke-test.yml @@ -12,7 +12,7 @@ jobs: timeout-minutes: 30 steps: - - uses: actions/checkout@v6 + - uses: actions/checkout@v6.0.1 - uses: astral-sh/setup-uv@v7 with: diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index a03b979ad761d..1041653c2f57e 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -16,7 +16,7 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - uses: actions/setup-python@83679a892e2d95755f2dac6acb0bfd1e9ac5d548 # v6.1.0 with: python-version: "3.12" diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index c8a52f1a63269..44bf71db5e9de 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -15,7 +15,7 @@ jobs: actions: write runs-on: ubuntu-latest steps: - - uses: 
actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0 + - uses: actions/stale@997185467fa4f803885201cee163a9f38240193d # v10.1.1 with: # Increasing this value ensures that changes to this workflow # propagate to all issues and PRs in days rather than months diff --git a/CMakeLists.txt b/CMakeLists.txt index e09972fe71995..6b93e3fe91603 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -874,7 +874,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}") if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS) set(SRCS - "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu") + "csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu" + "csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu" + "csrc/quantization/cutlass_w4a8/w4a8_utils.cu" + ) set_gencode_flags_for_srcs( SRCS "${SRCS}" @@ -944,7 +947,6 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1) set(VLLM_MOE_EXT_SRC "csrc/moe/torch_bindings.cpp" "csrc/moe/moe_align_sum_kernels.cu" - "csrc/moe/moe_lora_align_sum_kernels.cu" "csrc/moe/topk_softmax_kernels.cu") if(VLLM_GPU_LANG STREQUAL "CUDA") diff --git a/benchmarks/auto_tune/auto_tune.sh b/benchmarks/auto_tune/auto_tune.sh index 56b721cbb4021..25baa9cbda39c 100644 --- a/benchmarks/auto_tune/auto_tune.sh +++ b/benchmarks/auto_tune/auto_tune.sh @@ -96,8 +96,9 @@ start_server() { # This correctly passes each element as a separate argument. if [[ -n "$profile_dir" ]]; then # Start server with profiling enabled - VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \ - vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & + local profile_config_json="{\"profiler\": \"torch\", \"torch_profiler_dir\": \"$profile_dir\"}" + VLLM_SERVER_DEV_MODE=1 \ + vllm serve --profiler-config "$profile_config_json" "${common_args_array[@]}" > "$vllm_log" 2>&1 & else # Start server without profiling VLLM_SERVER_DEV_MODE=1 \ diff --git a/benchmarks/benchmark_ngram_proposer.py b/benchmarks/benchmark_ngram_proposer.py index cac401456b62a..b5373d383b548 100644 --- a/benchmarks/benchmark_ngram_proposer.py +++ b/benchmarks/benchmark_ngram_proposer.py @@ -32,12 +32,11 @@ def benchmark_propose(args): model_config = ModelConfig( model="facebook/opt-125m", - task="generate", max_model_len=args.num_token + args.num_spec_token, tokenizer="facebook/opt-125m", tokenizer_mode="auto", dtype="auto", - seed=None, + seed=0, trust_remote_code=False, ) proposer = NgramProposer( diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py index df122b4c5e8db..33aca831883aa 100644 --- a/benchmarks/benchmark_serving_structured_output.py +++ b/benchmarks/benchmark_serving_structured_output.py @@ -574,7 +574,7 @@ async def benchmark( ) print( "{:<40} {:<10.2f}".format( - "Total Token throughput (tok/s):", metrics.total_token_throughput + "Total token throughput (tok/s):", metrics.total_token_throughput ) ) @@ -963,8 +963,7 @@ def create_argument_parser(): parser.add_argument( "--profile", action="store_true", - help="Use Torch Profiler. The endpoint must be launched with " - "VLLM_TORCH_PROFILER_DIR to enable profiler.", + help="Use vLLM Profiling. 
--profiler-config must be provided on the server.", ) parser.add_argument( "--result-dir", diff --git a/benchmarks/kernels/benchmark_mla_k_concat.py b/benchmarks/kernels/benchmark_mla_k_concat.py new file mode 100644 index 0000000000000..fb3b6c8f12003 --- /dev/null +++ b/benchmarks/kernels/benchmark_mla_k_concat.py @@ -0,0 +1,150 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Benchmark script comparing torch.cat vs direct copy for k_nope/k_pe concatenation +in MLA (Multi-head Latent Attention) prefill. + +This validates that the optimization from commit 8d4142bd is beneficial across +various batch sizes, not just the originally tested batch size of 32768. +""" + +import time +from collections.abc import Callable + +import torch + +# DeepSeek-V3 MLA dimensions +NUM_HEADS = 128 +QK_NOPE_HEAD_DIM = 128 +PE_DIM = 64 + + +def cat_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor: + """Original torch.cat approach with expand.""" + return torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1) + + +def direct_copy_method(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor: + """Optimized direct copy approach (avoids expand + cat overhead).""" + k = torch.empty( + (*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]), + dtype=k_nope.dtype, + device=k_nope.device, + ) + k[..., : k_nope.shape[-1]] = k_nope + k[..., k_nope.shape[-1] :] = k_pe + return k + + +def benchmark_method( + method: Callable, + k_nope: torch.Tensor, + k_pe: torch.Tensor, + num_warmup: int = 10, + num_iters: int = 100, +) -> float: + """Benchmark a concatenation method and return mean latency in ms.""" + # Warmup + for _ in range(num_warmup): + _ = method(k_nope, k_pe) + torch.cuda.synchronize() + + # Benchmark + start = time.perf_counter() + for _ in range(num_iters): + _ = method(k_nope, k_pe) + torch.cuda.synchronize() + end = time.perf_counter() + + return (end - start) / num_iters * 1000 # Convert to ms + + +@torch.inference_mode() +def run_benchmark(dtype: torch.dtype, dtype_name: str): + """Run benchmark for a specific dtype.""" + torch.set_default_device("cuda") + + # Batch sizes to test (powers of 2 from 32 to 65536) + batch_sizes = [32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536] + + print("=" * 80) + print("Benchmark: torch.cat vs direct copy for MLA k_nope/k_pe concatenation") + print("=" * 80) + print( + f"Tensor shapes: k_nope=[B, {NUM_HEADS}, {QK_NOPE_HEAD_DIM}], " + f"k_pe=[B, 1, {PE_DIM}]" + ) + print(f"dtype: {dtype_name}") + print() + print( + f"{'Batch Size':>12} | {'cat (ms)':>10} | {'direct (ms)':>12} | " + f"{'Speedup':>8} | {'Reduction':>10}" + ) + print("-" * 70) + + results = [] + for batch_size in batch_sizes: + # Create input tensors (generate in float32 then convert for FP8 compatibility) + k_nope = torch.randn( + batch_size, NUM_HEADS, QK_NOPE_HEAD_DIM, dtype=torch.float32, device="cuda" + ).to(dtype) + k_pe = torch.randn( + batch_size, 1, PE_DIM, dtype=torch.float32, device="cuda" + ).to(dtype) + + # Benchmark both methods + cat_time = benchmark_method(cat_method, k_nope, k_pe) + direct_time = benchmark_method(direct_copy_method, k_nope, k_pe) + + speedup = cat_time / direct_time + reduction = (1 - direct_time / cat_time) * 100 + + results.append((batch_size, cat_time, direct_time, speedup, reduction)) + + print( + f"{batch_size:>12} | {cat_time:>10.3f} | {direct_time:>12.3f} | " + f"{speedup:>7.2f}x | {reduction:>9.1f}%" + ) + + print("=" * 80) + + # Summary statistics + 
speedups = [r[3] for r in results] + print("\nSpeedup summary:") + print(f" Min: {min(speedups):.2f}x") + print(f" Max: {max(speedups):.2f}x") + print(f" Mean: {sum(speedups) / len(speedups):.2f}x") + + # Find crossover point + crossover_batch = None + for batch_size, _, _, speedup, _ in results: + if speedup >= 1.0: + crossover_batch = batch_size + break + + print("\nConclusion:") + if crossover_batch: + print(f" - Direct copy becomes beneficial at batch size >= {crossover_batch}") + # Filter for large batches (>= 512 which is typical for prefill) + large_batch_speedups = [r[3] for r in results if r[0] >= 512] + if large_batch_speedups: + avg_large = sum(large_batch_speedups) / len(large_batch_speedups) + print(f" - For batch sizes >= 512: avg speedup = {avg_large:.2f}x") + print(" - MLA prefill typically uses large batches, so optimization is effective") + + return results + + +@torch.inference_mode() +def main(): + # Test bfloat16 + print("\n") + run_benchmark(torch.bfloat16, "bfloat16") + + # Test float8_e4m3fn + print("\n") + run_benchmark(torch.float8_e4m3fn, "float8_e4m3fn") + + +if __name__ == "__main__": + main() diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index fbbb03c5ed465..85b286f8d8d0a 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -251,17 +251,6 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON endif() # Build ACL with CMake - set(ARM_COMPUTE_BUILD_SHARED_LIB "OFF") - set(CMAKE_BUILD_TYPE "Release") - set(ARM_COMPUTE_ARCH "armv8.2-a") - set(ARM_COMPUTE_ENABLE_ASSERTS "OFF") - set(ARM_COMPUTE_ENABLE_CPPTHREADS "OFF") - set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER") - set(ARM_COMPUTE_ENABLE_OPENMP "ON") - set(ARM_COMPUTE_ENABLE_WERROR "OFF") - set(ARM_COMPUTE_BUILD_EXAMPLES "OFF") - set(ARM_COMPUTE_BUILD_TESTING "OFF") - set(_cmake_config_cmd ${CMAKE_COMMAND} -G Ninja -B build -DARM_COMPUTE_BUILD_SHARED_LIB=OFF diff --git a/csrc/cpu/cpu_attn.cpp b/csrc/cpu/cpu_attn.cpp index 92f8bee5a47a0..02c722ba031a4 100644 --- a/csrc/cpu/cpu_attn.cpp +++ b/csrc/cpu/cpu_attn.cpp @@ -117,7 +117,6 @@ torch::Tensor get_scheduler_metadata( input.casual = casual; input.isa = isa; input.enable_kv_split = enable_kv_split; - TORCH_CHECK(casual, "Only supports casual mask for now."); VLLM_DISPATCH_FLOATING_TYPES(dtype, "get_scheduler_metadata", [&]() { CPU_ATTN_DISPATCH_CASE_HEADDIM(head_dim, [&] { diff --git a/csrc/cpu/cpu_attn_impl.hpp b/csrc/cpu/cpu_attn_impl.hpp index 02164ed3666e3..e3e077b845f4f 100644 --- a/csrc/cpu/cpu_attn_impl.hpp +++ b/csrc/cpu/cpu_attn_impl.hpp @@ -186,7 +186,7 @@ struct AttentionMetadata { // - Intermediate outputs: q_tile_size * head_dim * output_buffer_elem_size + 2 // * q_tile_size * 4, partial output, max + sum (float) // Reduction scratchpad contains: -// - flags: bool array to indicate wether the split is finished +// - flags: bool array to indicate whether the split is finished // - outputs: split_num * q_tile_size * head_dim * output_buffer_elem_size // - max, sum: 2 * split_num * q_tile_size * 4 class AttentionScratchPad { diff --git a/csrc/moe/grouped_topk_kernels.cu b/csrc/moe/grouped_topk_kernels.cu index 69b4c1fb11d1a..47ee5f021eb4a 100644 --- a/csrc/moe/grouped_topk_kernels.cu +++ b/csrc/moe/grouped_topk_kernels.cu @@ -444,23 +444,27 @@ __device__ inline T apply_sigmoid(T val) { return cuda_cast(sigmoid_accurate(f)); } -template +template +__device__ inline T apply_scoring(T val) { + if constexpr (SF == SCORING_SIGMOID) { + return apply_sigmoid(val); + } else { + return val; 
+ } +} + +template __device__ void topk_with_k2(T* output, T const* input, T const* bias, cg::thread_block_tile<32> const& tile, int32_t const lane_id, - int const num_experts_per_group, - int const scoring_func) { + int const num_experts_per_group) { // Get the top2 per thread T largest = neg_inf(); T second_largest = neg_inf(); if (num_experts_per_group > WARP_SIZE) { for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) { - T value = input[i]; - // Apply scoring function if needed - if (scoring_func == SCORING_SIGMOID) { - value = apply_sigmoid(value); - } + T value = apply_scoring(input[i]); value = value + bias[i]; if (value > largest) { @@ -472,11 +476,7 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias, } } else { for (int i = lane_id; i < num_experts_per_group; i += WARP_SIZE) { - T value = input[i]; - // Apply scoring function if needed - if (scoring_func == SCORING_SIGMOID) { - value = apply_sigmoid(value); - } + T value = apply_scoring(input[i]); value = value + bias[i]; largest = value; } @@ -501,13 +501,12 @@ __device__ void topk_with_k2(T* output, T const* input, T const* bias, } } -template +template __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias, int64_t const num_tokens, int64_t const num_cases, int64_t const n_group, - int64_t const num_experts_per_group, - int const scoring_func) { + int64_t const num_experts_per_group) { int32_t warp_id = threadIdx.x / WARP_SIZE; int32_t lane_id = threadIdx.x % WARP_SIZE; @@ -525,21 +524,21 @@ __global__ void topk_with_k2_kernel(T* output, T* input, T const* bias, #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) asm volatile("griddepcontrol.wait;"); #endif - topk_with_k2(output, input, group_bias, tile, lane_id, - num_experts_per_group, scoring_func); + topk_with_k2(output, input, group_bias, tile, lane_id, + num_experts_per_group); } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) asm volatile("griddepcontrol.launch_dependents;"); #endif } -template +template __global__ void group_idx_and_topk_idx_kernel( T* scores, T const* group_scores, float* topk_values, IdxT* topk_indices, T const* bias, int64_t const num_tokens, int64_t const n_group, int64_t const topk_group, int64_t const topk, int64_t const num_experts, int64_t const num_experts_per_group, bool renormalize, - double routed_scaling_factor, int scoring_func) { + double routed_scaling_factor) { int32_t warp_id = threadIdx.x / WARP_SIZE; int32_t lane_id = threadIdx.x % WARP_SIZE; int32_t case_id = @@ -549,6 +548,11 @@ __global__ void group_idx_and_topk_idx_kernel( topk_values += case_id * topk; topk_indices += case_id * topk; + constexpr bool kUseStaticNGroup = (NGroup > 0); + // use int32 to avoid implicit conversion + int32_t const n_group_i32 = + kUseStaticNGroup ? 
NGroup : static_cast(n_group); + int32_t align_num_experts_per_group = warp_topk::round_up_to_multiple_of(num_experts_per_group); @@ -574,13 +578,14 @@ __global__ void group_idx_and_topk_idx_kernel( if (case_id < num_tokens) { // calculate group_idx - int32_t target_num_min = WARP_SIZE - n_group + topk_group; + int32_t target_num_min = + WARP_SIZE - n_group_i32 + static_cast(topk_group); // The check is necessary to avoid abnormal input - if (lane_id < n_group && is_finite(group_scores[lane_id])) { + if (lane_id < n_group_i32 && is_finite(group_scores[lane_id])) { value = group_scores[lane_id]; } - int count_equal_to_top_value = WARP_SIZE - n_group; + int count_equal_to_top_value = WARP_SIZE - n_group_i32; int pre_count_equal_to_top_value = 0; // Use loop to find the largset top_group while (count_equal_to_top_value < target_num_min) { @@ -604,7 +609,7 @@ __global__ void group_idx_and_topk_idx_kernel( int count_equalto_topkth_group = 0; bool if_proceed_next_topk = topk_group_value != neg_inf(); if (case_id < num_tokens && if_proceed_next_topk) { - for (int i_group = 0; i_group < n_group; i_group++) { + auto process_group = [&](int i_group) { if ((group_scores[i_group] > topk_group_value) || ((group_scores[i_group] == topk_group_value) && (count_equalto_topkth_group < num_equalto_topkth_group))) { @@ -613,11 +618,10 @@ __global__ void group_idx_and_topk_idx_kernel( i += WARP_SIZE) { T candidates = neg_inf(); if (i < num_experts_per_group) { - // Apply scoring function (if any) and add bias + // apply scoring function (if any) and add bias T input = scores[offset + i]; if (is_finite(input)) { - T score = (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input) - : input; + T score = apply_scoring(input); candidates = score + bias[offset + i]; } } @@ -627,6 +631,17 @@ __global__ void group_idx_and_topk_idx_kernel( count_equalto_topkth_group++; } } + }; + + if constexpr (kUseStaticNGroup) { +#pragma unroll + for (int i_group = 0; i_group < NGroup; ++i_group) { + process_group(i_group); + } + } else { + for (int i_group = 0; i_group < n_group_i32; ++i_group) { + process_group(i_group); + } } queue.done(); __syncwarp(); @@ -646,12 +661,13 @@ __global__ void group_idx_and_topk_idx_kernel( if (i < topk) { // Load the score value (without bias) for normalization T input = scores[s_topk_idx[i]]; - value = - (scoring_func == SCORING_SIGMOID) ? apply_sigmoid(input) : input; + value = apply_scoring(input); s_topk_value[i] = value; } - topk_sum += - cg::reduce(tile, cuda_cast(value), cg::plus()); + if (renormalize) { + topk_sum += + cg::reduce(tile, cuda_cast(value), cg::plus()); + } } } @@ -660,13 +676,9 @@ __global__ void group_idx_and_topk_idx_kernel( if (case_id < num_tokens) { if (if_proceed_next_topk) { for (int i = lane_id; i < topk; i += WARP_SIZE) { - float value; - if (renormalize) { - value = cuda_cast(s_topk_value[i]) / topk_sum * - routed_scaling_factor; - } else { - value = cuda_cast(s_topk_value[i]) * routed_scaling_factor; - } + float base = cuda_cast(s_topk_value[i]); + float value = renormalize ? 
(base / topk_sum * routed_scaling_factor) + : (base * routed_scaling_factor); topk_indices[i] = s_topk_idx[i]; topk_values[i] = value; } @@ -684,6 +696,45 @@ __global__ void group_idx_and_topk_idx_kernel( #endif } +template +inline void launch_group_idx_and_topk_kernel( + cudaLaunchConfig_t const& config, T* scores, T* group_scores, + float* topk_values, IdxT* topk_indices, T const* bias, + int64_t const num_tokens, int64_t const n_group, int64_t const topk_group, + int64_t const topk, int64_t const num_experts, + int64_t const num_experts_per_group, bool const renormalize, + double const routed_scaling_factor) { + auto launch = [&](auto* kernel_instance2) { + cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores, + topk_values, topk_indices, bias, num_tokens, n_group, + topk_group, topk, num_experts, num_experts_per_group, + renormalize, routed_scaling_factor); + }; + + switch (n_group) { + case 4: { + launch(&group_idx_and_topk_idx_kernel); + break; + } + case 8: { + launch(&group_idx_and_topk_idx_kernel); + break; + } + case 16: { + launch(&group_idx_and_topk_idx_kernel); + break; + } + case 32: { + launch(&group_idx_and_topk_idx_kernel); + break; + } + default: { + launch(&group_idx_and_topk_idx_kernel); + break; + } + } +} + template void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values, IdxT* topk_indices, T const* bias, int64_t const num_tokens, @@ -694,7 +745,6 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values, cudaStream_t const stream = 0) { int64_t num_cases = num_tokens * n_group; int64_t topk_with_k2_num_blocks = (num_cases - 1) / NUM_WARPS_PER_BLOCK + 1; - auto* kernel_instance1 = &topk_with_k2_kernel; cudaLaunchConfig_t config; config.gridDim = topk_with_k2_num_blocks; config.blockDim = BLOCK_SIZE; @@ -705,16 +755,33 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values, attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl; config.numAttrs = 1; config.attrs = attrs; - cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias, - num_tokens, num_cases, n_group, num_experts / n_group, - scoring_func); + auto const sf = static_cast(scoring_func); + int64_t const num_experts_per_group = num_experts / n_group; + auto launch_topk_with_k2 = [&](auto* kernel_instance1) { + cudaLaunchKernelEx(&config, kernel_instance1, group_scores, scores, bias, + num_tokens, num_cases, n_group, num_experts_per_group); + }; + switch (sf) { + case SCORING_NONE: { + auto* kernel_instance1 = &topk_with_k2_kernel; + launch_topk_with_k2(kernel_instance1); + break; + } + case SCORING_SIGMOID: { + auto* kernel_instance1 = &topk_with_k2_kernel; + launch_topk_with_k2(kernel_instance1); + break; + } + default: + // should be guarded by higher level checks. 
+ TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc"); + } int64_t topk_with_k_group_num_blocks = (num_tokens - 1) / NUM_WARPS_PER_BLOCK + 1; size_t dynamic_smem_in_bytes = warp_topk::calc_smem_size_for_block_wide(NUM_WARPS_PER_BLOCK, topk); - auto* kernel_instance2 = &group_idx_and_topk_idx_kernel; config.gridDim = topk_with_k_group_num_blocks; config.blockDim = BLOCK_SIZE; config.dynamicSmemBytes = dynamic_smem_in_bytes; @@ -723,10 +790,24 @@ void invokeNoAuxTc(T* scores, T* group_scores, float* topk_values, attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl; config.numAttrs = 1; config.attrs = attrs; - cudaLaunchKernelEx(&config, kernel_instance2, scores, group_scores, - topk_values, topk_indices, bias, num_tokens, n_group, - topk_group, topk, num_experts, num_experts / n_group, - renormalize, routed_scaling_factor, scoring_func); + switch (sf) { + case SCORING_NONE: { + launch_group_idx_and_topk_kernel( + config, scores, group_scores, topk_values, topk_indices, bias, + num_tokens, n_group, topk_group, topk, num_experts, + num_experts_per_group, renormalize, routed_scaling_factor); + break; + } + case SCORING_SIGMOID: { + launch_group_idx_and_topk_kernel( + config, scores, group_scores, topk_values, topk_indices, bias, + num_tokens, n_group, topk_group, topk, num_experts, + num_experts_per_group, renormalize, routed_scaling_factor); + break; + } + default: + TORCH_CHECK(false, "Unsupported scoring_func in invokeNoAuxTc"); + } } #define INSTANTIATE_NOAUX_TC(T, IdxT) \ diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu index ddcdcc38b4fea..5c9e474024082 100644 --- a/csrc/moe/moe_align_sum_kernels.cu +++ b/csrc/moe/moe_align_sum_kernels.cu @@ -14,7 +14,6 @@ namespace vllm { namespace moe { - namespace batched_moe_align_block_size { // Note num_threads needs to be 1024 for BlockScan Reduction in the kernel. @@ -80,23 +79,30 @@ __global__ void batched_moe_align_block_size_kernel( } // namespace batched_moe_align_block_size template -__global__ void moe_align_block_size_kernel( +__device__ void _moe_align_block_size( const scalar_t* __restrict__ topk_ids, int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, int32_t* __restrict__ total_tokens_post_pad, int32_t* __restrict__ expert_map, int32_t num_experts, int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size, size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded, - bool has_expert_map) { + int32_t max_num_m_blocks, int32_t model_offset, int32_t inactive_expert_id, + int32_t topk_num, int32_t* token_mask, bool has_expert_map) { extern __shared__ int32_t shared_counts[]; - // Use a separate threadblock to fill sorted_token_ids. + // Compute input buffer offsets. Typically these will all be 0, except when + // using Multi LoRA. + int sorted_token_ids_offset = max_num_tokens_padded * model_offset; + int expert_ids_offset = max_num_m_blocks * model_offset; + int cumsum_offset = (num_experts + 1) * model_offset; + + // Use separate threadblocks to fill sorted_token_ids. // This is safe since the current kernel does not use sorted_token_ids. 
- if (blockIdx.x == 1) { + if (blockIdx.x % 2) { // Initialize sorted_token_ids with numel for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) { - sorted_token_ids[it] = numel; + sorted_token_ids[sorted_token_ids_offset + it] = numel; } return; } @@ -127,7 +133,9 @@ __global__ void moe_align_block_size_kernel( } int warp_idx = expert_id / experts_per_warp; int expert_offset = expert_id % experts_per_warp; - atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1); + int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num]; + atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], + mask); } __syncthreads(); @@ -148,77 +156,44 @@ __global__ void moe_align_block_size_kernel( int cumsum_val; BlockScan(temp_storage).ExclusiveSum(expert_count, cumsum_val); if (expert_id <= num_experts) { - cumsum[expert_id] = cumsum_val; + cumsum[cumsum_offset + expert_id] = cumsum_val; } if (expert_id == num_experts) { - *total_tokens_post_pad = cumsum_val; + total_tokens_post_pad[model_offset] = cumsum_val; } __syncthreads(); if (threadIdx.x < num_experts) { - for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; - i += block_size) { - expert_ids[i / block_size] = threadIdx.x; + for (int i = cumsum[cumsum_offset + threadIdx.x]; + i < cumsum[cumsum_offset + threadIdx.x + 1]; i += block_size) { + expert_ids[expert_ids_offset + i / block_size] = threadIdx.x; } } // Fill remaining expert_ids with 0 - const size_t fill_start_idx = cumsum[num_experts] / block_size + threadIdx.x; - const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size); - for (size_t i = fill_start_idx; i < expert_ids_size; i += blockDim.x) { - expert_ids[i] = 0; - } -} - -template -__global__ void count_and_sort_expert_tokens_kernel( - const scalar_t* __restrict__ topk_ids, - int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer, - int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts, - bool has_expert_map) { - const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; - const size_t stride = blockDim.x * gridDim.x; - - for (size_t i = tid; i < numel; i += stride) { - int32_t expert_id = topk_ids[i]; - if (expert_id >= num_experts) { - continue; - } - if (has_expert_map) { - expert_id = expert_map[expert_id]; - // filter invalid experts - if (expert_id == -1) continue; - } - int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1); - sorted_token_ids[rank_post_pad] = i; - } -} - -template -__global__ void moe_sum_kernel( - scalar_t* __restrict__ out, // [..., d] - const scalar_t* __restrict__ input, // [..., topk, d] - const int d) { - const int64_t token_idx = blockIdx.x; - for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { - scalar_t x = 0.0; -#pragma unroll - for (int k = 0; k < TOPK; ++k) { - x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]); - } - out[token_idx * d + idx] = x; + const size_t fill_start_idx = + cumsum[cumsum_offset + num_experts] / block_size + threadIdx.x; + for (size_t i = fill_start_idx; i < max_num_m_blocks; i += blockDim.x) { + expert_ids[expert_ids_offset + i] = inactive_expert_id; } } template -__global__ void moe_align_block_size_small_batch_expert_kernel( +__device__ void _moe_align_block_size_small_batch_expert( const scalar_t* __restrict__ topk_ids, int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, int32_t* __restrict__ total_tokens_post_pad, int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size, - size_t numel, int32_t 
max_num_tokens_padded, bool has_expert_map) { + size_t numel, int32_t max_num_tokens_padded, int32_t max_num_m_blocks, + int32_t inactive_expert_id, int32_t model_offset, int32_t topk_num, + int32_t* token_mask, bool has_expert_map) { + // Compute input buffer offsets. Typically these will all be 0, except when + // using Multi LoRA. + int sorted_token_ids_offset = max_num_tokens_padded * model_offset; + int expert_ids_offset = max_num_m_blocks * model_offset; + // Use an additional group of threads to fill sorted_token_ids. // Since the current kernel will use sorted_token_ids afterward, // we fill sorted_token_ids within the same threadblock to make @@ -227,7 +202,7 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( // Initialize sorted_token_ids with numel for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += fill_threads) { - sorted_token_ids[it] = numel; + sorted_token_ids[sorted_token_ids_offset + it] = numel; } // Three __syncthreads() corresponding to the other threads __syncthreads(); @@ -254,7 +229,8 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( // filter invalid expert if (expert_id == -1) continue; } - ++tokens_cnts[(tid + 1) * num_experts + expert_id]; + int mask = token_mask == nullptr ? 1 : token_mask[i / topk_num]; + tokens_cnts[(tid + 1) * num_experts + expert_id] += mask; } __syncthreads(); @@ -277,22 +253,22 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( CEILDIV(tokens_cnts[stride * num_experts + i - 1], block_size) * block_size; } - *total_tokens_post_pad = static_cast(cumsum[num_experts]); + total_tokens_post_pad[model_offset] = + static_cast(cumsum[num_experts]); } __syncthreads(); if (tid < num_experts) { for (int i = cumsum[tid]; i < cumsum[tid + 1]; i += block_size) { - expert_ids[i / block_size] = tid; + expert_ids[expert_ids_offset + i / block_size] = tid; } } // Fill remaining expert_ids with 0 const size_t fill_start_idx = cumsum[num_experts] / block_size + tid; - const size_t expert_ids_size = CEILDIV(max_num_tokens_padded, block_size); - for (size_t i = fill_start_idx; i < expert_ids_size; i += stride) { - expert_ids[i] = 0; + for (size_t i = fill_start_idx; i < max_num_m_blocks; i += stride) { + expert_ids[expert_ids_offset + i] = inactive_expert_id; } for (size_t i = tid; i < numel; i += stride) { @@ -304,11 +280,195 @@ __global__ void moe_align_block_size_small_batch_expert_kernel( } int32_t rank_post_pad = tokens_cnts[tid * num_experts + expert_id] + cumsum[expert_id]; - sorted_token_ids[rank_post_pad] = i; - ++tokens_cnts[tid * num_experts + expert_id]; + + if (token_mask == nullptr || token_mask[i / topk_num]) { + sorted_token_ids[sorted_token_ids_offset + rank_post_pad] = i; + ++tokens_cnts[tid * num_experts + expert_id]; + } } } +template +__device__ void _count_and_sort_expert_tokens( + const scalar_t* __restrict__ topk_ids, + int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer, + int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts, + int32_t max_num_tokens_padded, int32_t* __restrict__ token_mask, + int32_t model_offset, int32_t topk_num, bool has_expert_map) { + const size_t tid = blockIdx.y * blockDim.x + threadIdx.x; + const size_t stride = blockDim.x * gridDim.y; + + for (size_t i = tid; i < numel; i += stride) { + int32_t expert_id = topk_ids[i]; + if (expert_id >= num_experts) { + continue; + } + + if (has_expert_map) { + expert_id = expert_map[expert_id]; + // filter invalid experts + if (expert_id == -1) continue; + } + + if (token_mask 
== nullptr || token_mask[i / topk_num]) { + int32_t rank_post_pad = atomicAdd( + &cumsum_buffer[(model_offset * (num_experts + 1)) + expert_id], 1); + sorted_token_ids[max_num_tokens_padded * model_offset + rank_post_pad] = + i; + } + } +} + +template +__global__ void moe_align_block_size_kernel( + const scalar_t* __restrict__ topk_ids, + int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, + int32_t* __restrict__ total_tokens_post_pad, + int32_t* __restrict__ expert_map, int32_t num_experts, + int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size, + size_t numel, int32_t* __restrict__ cumsum, int32_t max_num_tokens_padded, + int32_t topk_num, bool has_expert_map) { + _moe_align_block_size( + topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map, + num_experts, padded_num_experts, experts_per_warp, block_size, numel, + cumsum, max_num_tokens_padded, CEILDIV(max_num_tokens_padded, block_size), + 0, 0, topk_num, nullptr, has_expert_map); +} + +template +__global__ void count_and_sort_expert_tokens_kernel( + const scalar_t* __restrict__ topk_ids, + int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer, + int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts, + int32_t max_num_tokens_padded, int32_t topk_num, bool has_expert_map) { + _count_and_sort_expert_tokens( + topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts, + max_num_tokens_padded, nullptr, 0, topk_num, has_expert_map); +} + +template +__global__ void moe_sum_kernel( + scalar_t* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., topk, d] + const int d) { + const int64_t token_idx = blockIdx.x; + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + scalar_t x = 0.0; +#pragma unroll + for (int k = 0; k < TOPK; ++k) { + x += VLLM_LDG(&input[token_idx * TOPK * d + k * d + idx]); + } + out[token_idx * d + idx] = x; + } +} + +template +__global__ void moe_align_block_size_small_batch_expert_kernel( + const scalar_t* __restrict__ topk_ids, + int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, + int32_t* __restrict__ total_tokens_post_pad, + int32_t* __restrict__ expert_map, int32_t num_experts, int32_t block_size, + size_t numel, int32_t max_num_tokens_padded, int32_t topk_num, + bool has_expert_map) { + _moe_align_block_size_small_batch_expert( + topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map, + num_experts, block_size, numel, max_num_tokens_padded, + CEILDIV(max_num_tokens_padded, block_size), 0, 0, topk_num, nullptr, + has_expert_map); +} + +template +__global__ void moe_lora_align_block_size_kernel( + scalar_t* __restrict__ topk_ids, int32_t* __restrict__ token_lora_mapping, + int64_t block_size, int32_t* __restrict__ expert_map, int num_experts, + int max_loras, size_t numel, int max_num_tokens_padded, + int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids, + int32_t* __restrict__ expert_ids, int32_t topk_num, + int32_t* total_tokens_post_pad, int32_t* adapter_enabled, + int32_t* __restrict__ cumsum, int32_t experts_per_warp, + int32_t padded_num_experts, int32_t* lora_ids, + int32_t* __restrict__ token_mask, bool has_expert_map) { + int lora_idx = blockIdx.x / 2; + int lora_id = lora_ids[lora_idx]; + if (lora_id == -1 || adapter_enabled[lora_id] == 0) { + return; + } + + // Populate the token_mask based on the token-LoRA mapping + int num_tokens = numel / topk_num; + if (threadIdx.x == 0) { + total_tokens_post_pad[lora_id] = 
0; + + for (int i = 0; i < num_tokens; i++) { + token_mask[(lora_id * num_tokens) + i] = + (int)token_lora_mapping[i] == lora_id; + } + } + + __syncthreads(); + + _moe_align_block_size( + topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map, + num_experts, padded_num_experts, experts_per_warp, block_size, numel, + cumsum, max_num_tokens_padded, max_num_m_blocks, lora_id, -1, topk_num, + &token_mask[(lora_id * num_tokens)], has_expert_map); +} + +template +__global__ void lora_count_and_sort_expert_tokens_kernel( + const scalar_t* __restrict__ topk_ids, + int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer, + int32_t* __restrict__ expert_map, size_t numel, int32_t num_experts, + int32_t max_num_tokens_padded, int32_t topk_num, int32_t* token_mask, + int32_t* lora_ids, bool has_expert_map) { + int lora_idx = blockIdx.x; + int lora_id = lora_ids[lora_idx]; + if (lora_id == -1) { + return; + } + + int num_tokens = numel / topk_num; + + _count_and_sort_expert_tokens( + topk_ids, sorted_token_ids, cumsum_buffer, expert_map, numel, num_experts, + max_num_tokens_padded, &token_mask[(lora_id * num_tokens)], lora_id, + topk_num, has_expert_map); +} + +template +__global__ void moe_lora_align_block_size_small_batch_expert_kernel( + scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping, + int64_t block_size, int32_t* __restrict__ expert_map, int num_experts, + int max_loras, size_t numel, int max_num_tokens_padded, + int max_num_m_blocks, int32_t* __restrict__ sorted_token_ids, + int32_t* __restrict__ expert_ids, int topk_num, + int32_t* total_tokens_post_pad, int32_t* adapter_enabled, int32_t* lora_ids, + int32_t* token_mask, bool has_expert_map) { + int lora_idx = blockIdx.x; + int lora_id = lora_ids[lora_idx]; + if (lora_id == -1 || adapter_enabled[lora_id] == 0) { + return; + } + + int num_tokens = numel / topk_num; + if (threadIdx.x == 0) { + total_tokens_post_pad[lora_id] = 0; + + for (int i = 0; i < num_tokens; i++) { + token_mask[(lora_id * num_tokens) + i] = + (int)token_lora_mapping[i] == lora_id; + } + } + + __syncthreads(); + + _moe_align_block_size_small_batch_expert( + topk_ids, sorted_token_ids, expert_ids, total_tokens_post_pad, expert_map, + num_experts, block_size, numel, max_num_tokens_padded, max_num_m_blocks, + -1, lora_id, topk_num, &token_mask[(lora_id * num_tokens)], + has_expert_map); +} + } // namespace moe } // namespace vllm @@ -365,7 +525,8 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, experts_ids.data_ptr(), num_tokens_post_pad.data_ptr(), expert_map.data_ptr(), num_experts, block_size, - topk_ids.numel(), sorted_token_ids.size(0), has_expert_map); + topk_ids.numel(), sorted_token_ids.size(0), topk_ids.size(1), + has_expert_map); } else { torch::Tensor cumsum_buffer = torch::empty({num_experts + 1}, options_int); @@ -386,21 +547,23 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts, expert_map.data_ptr(), num_experts, padded_num_experts, experts_per_warp, block_size, topk_ids.numel(), cumsum_buffer.data_ptr(), sorted_token_ids.size(0), - has_expert_map); + topk_ids.size(1), has_expert_map); const int block_threads = std::min(256, (int)threads); const int num_blocks = (topk_ids.numel() + block_threads - 1) / block_threads; const int max_blocks = 65535; const int actual_blocks = std::min(num_blocks, max_blocks); + dim3 gridDims(1, actual_blocks); auto sort_kernel = vllm::moe::count_and_sort_expert_tokens_kernel; - sort_kernel<<>>( + sort_kernel<<>>( topk_ids.data_ptr(), 
sorted_token_ids.data_ptr(), cumsum_buffer.data_ptr(), expert_map.data_ptr(), - topk_ids.numel(), num_experts, has_expert_map); + topk_ids.numel(), num_experts, sorted_token_ids.size(0), + topk_ids.size(1), has_expert_map); } }); } @@ -474,3 +637,123 @@ void moe_sum(torch::Tensor& input, // [num_tokens, topk, hidden_size] break; } } + +void moe_lora_align_block_size( + torch::Tensor topk_ids, torch::Tensor token_lora_mapping, + int64_t num_experts, int64_t block_size, int64_t max_loras, + int64_t max_num_tokens_padded, int64_t max_num_m_blocks, + torch::Tensor sorted_token_ids, torch::Tensor expert_ids, + torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled, + torch::Tensor lora_ids, std::optional maybe_expert_map) { + const int topk_num = topk_ids.size(1); + + TORCH_CHECK(block_size > 0, "block_size should be greater than 0. "); + + int device_max_shared_mem; + auto dev = topk_ids.get_device(); + cudaDeviceGetAttribute(&device_max_shared_mem, + cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + int64_t padded_num_experts = + ((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE; + + // BlockScan uses 1024 threads and assigns one thread per expert. + TORCH_CHECK(padded_num_experts < 1024, + "padded_num_experts must be less than 1024"); + + auto options_int = + torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device()); + torch::Tensor token_mask = + torch::empty({max_loras * topk_ids.size(0)}, options_int); + bool has_expert_map = maybe_expert_map.has_value(); + torch::Tensor expert_map; + if (has_expert_map) { + expert_map = maybe_expert_map.value(); + } else { + expert_map = torch::empty({0}, options_int); + } + + VLLM_DISPATCH_INTEGRAL_TYPES( + topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] { + bool small_batch_expert_mode = + (topk_ids.numel() < 1024) && (num_experts <= 64); + + if (small_batch_expert_mode) { + const int32_t num_thread = max((int32_t)num_experts, 128); + const int32_t shared_mem = + (num_thread + 1) * num_experts * sizeof(int32_t) + + (num_experts + 1) * sizeof(int32_t); + if (shared_mem > device_max_shared_mem) { + TORCH_CHECK(false, "Shared memory usage exceeds device limit."); + } + + // threadIdx.x >= fill_threads: counting experts and aligning + // threadIdx.x < fill_threads: filling sorted_token_ids + constexpr int32_t fill_threads = 256; + + dim3 blockDim(num_thread + fill_threads); + auto kernel = + vllm::moe::moe_lora_align_block_size_small_batch_expert_kernel< + scalar_t, fill_threads>; + AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( + (void*)kernel, shared_mem)); + kernel<<>>( + topk_ids.data_ptr(), + token_lora_mapping.data_ptr(), block_size, + expert_map.data_ptr(), num_experts, max_loras, + topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks, + sorted_token_ids.data_ptr(), + expert_ids.data_ptr(), topk_num, + num_tokens_post_pad.data_ptr(), + adapter_enabled.data_ptr(), lora_ids.data_ptr(), + token_mask.data_ptr(), has_expert_map); + } else { + int num_thread = 1024; + dim3 blockDim(num_thread); + size_t num_warps = CEILDIV(padded_num_experts, WARP_SIZE); + + size_t shared_mem_size = num_warps * WARP_SIZE * sizeof(int32_t); + + // cumsum buffer + torch::Tensor cumsum = + torch::zeros({max_loras * (num_experts + 1)}, options_int); + + auto align_kernel = + vllm::moe::moe_lora_align_block_size_kernel; + + // launch two threadblocks for each lora + // blockIdx.x % 2 == 0: counting experts and aligning + // blockIdx.x % 2 == 1: filling 
sorted_token_ids + align_kernel<<>>( + topk_ids.data_ptr(), + token_lora_mapping.data_ptr(), block_size, + expert_map.data_ptr(), num_experts, max_loras, + topk_ids.numel(), max_num_tokens_padded, max_num_m_blocks, + sorted_token_ids.data_ptr(), + expert_ids.data_ptr(), topk_num, + num_tokens_post_pad.data_ptr(), + adapter_enabled.data_ptr(), cumsum.data_ptr(), + WARP_SIZE, padded_num_experts, lora_ids.data_ptr(), + token_mask.data_ptr(), has_expert_map); + + const int block_threads = std::min(256, (int)num_thread); + const int num_blocks = + (topk_ids.numel() + block_threads - 1) / block_threads; + + const int max_blocks = 65535; + const int actual_blocks = std::min(num_blocks, max_blocks); + + dim3 gridDims(max_loras, actual_blocks); + auto sort_kernel = + vllm::moe::lora_count_and_sort_expert_tokens_kernel; + + sort_kernel<<>>( + topk_ids.data_ptr(), + sorted_token_ids.data_ptr(), cumsum.data_ptr(), + expert_map.data_ptr(), topk_ids.numel(), num_experts, + max_num_tokens_padded, topk_num, token_mask.data_ptr(), + lora_ids.data_ptr(), has_expert_map); + } + }); +} \ No newline at end of file diff --git a/csrc/moe/moe_lora_align_sum_kernels.cu b/csrc/moe/moe_lora_align_sum_kernels.cu deleted file mode 100644 index 360f1312cf579..0000000000000 --- a/csrc/moe/moe_lora_align_sum_kernels.cu +++ /dev/null @@ -1,174 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "../cuda_compat.h" -#include "../dispatch_utils.h" -#include "core/math.hpp" - -namespace { - -__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row, - int32_t col) { - return row * total_col + col; -} - -} // namespace - -// TODO: Refactor common parts with moe_align_sum_kernels -template -__global__ void moe_lora_align_sum_kernel( - scalar_t* __restrict__ topk_ids, int32_t* token_lora_mapping, - int64_t block_size, int num_experts, int max_loras, size_t numel, - int max_num_tokens_padded, int max_num_m_blocks, - int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids, - int topk_num, int32_t* total_tokens_post_pad, int32_t* adapter_enabled, - int32_t* lora_ids) { - const size_t tokens_per_thread = div_ceil(numel, blockDim.x); - const size_t start_idx = threadIdx.x * tokens_per_thread; - - int lora_idx = blockIdx.x; - int lora_id = lora_ids[lora_idx]; - if (lora_id == -1 || adapter_enabled[lora_id] == 0) { - return; - } - extern __shared__ int32_t shared_mem[]; - int32_t* cumsum = shared_mem; - token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + num_experts + 1); - - // Initialize sorted_token_ids with numel - for (size_t it = threadIdx.x; it < max_num_tokens_padded; it += blockDim.x) { - sorted_token_ids[lora_id * max_num_tokens_padded + it] = numel; - } - - // Initialize expert_ids with -1 - for (size_t it = threadIdx.x; it < max_num_m_blocks; it += blockDim.x) { - expert_ids[lora_id * max_num_m_blocks + it] = -1; - } - - // Initialize total_tokens_post_pad with 0 - if (threadIdx.x == 0) { - total_tokens_post_pad[lora_id] = 0; - } - - for (int i = 0; i < num_experts; ++i) { - tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0; - } - - for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { - int mask = token_lora_mapping[i / topk_num] == lora_id; - int idx = index(num_experts, threadIdx.x + 1, topk_ids[i]); - tokens_cnts[idx] += mask; - } - - __syncthreads(); - - // For each expert we accumulate the token counts from the different threads. 
- if (threadIdx.x < num_experts) { - tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0; - for (int i = 1; i <= blockDim.x; ++i) { - tokens_cnts[index(num_experts, i, threadIdx.x)] += - tokens_cnts[index(num_experts, i - 1, threadIdx.x)]; - } - } - - __syncthreads(); - - // We accumulate the token counts of all experts in thread 0. - if (threadIdx.x == 0) { - cumsum[0] = 0; - for (int i = 1; i <= num_experts; ++i) { - cumsum[i] = cumsum[i - 1] + - div_ceil(tokens_cnts[index(num_experts, blockDim.x, i - 1)], - block_size) * - block_size; - } - total_tokens_post_pad[lora_id] = static_cast(cumsum[num_experts]); - } - - __syncthreads(); - - /** - * For each expert, each thread processes the tokens of the corresponding - * blocks and stores the corresponding expert_id for each block. - */ - if (threadIdx.x < num_experts) { - for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1]; - i += block_size) { - expert_ids[index(max_num_m_blocks, lora_id, i / block_size)] = - threadIdx.x; - } - } - - for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) { - int32_t expert_id = topk_ids[i]; - /** The cumsum[expert_id] stores the starting index of the tokens that the - * expert with expert_id needs to process, and - * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens - * processed by the expert with expert_id within the current thread's token - * shard. - */ - int32_t rank_post_pad = - tokens_cnts[index(num_experts, threadIdx.x, expert_id)] + - cumsum[expert_id]; - - int mask = (int)token_lora_mapping[i / topk_num] == lora_id; - atomicAdd( - &sorted_token_ids[index(max_num_tokens_padded, lora_id, rank_post_pad)], - (i - numel) * mask); - tokens_cnts[index(num_experts, threadIdx.x, expert_id)] += mask; - } -} - -void moe_lora_align_block_size( - torch::Tensor topk_ids, torch::Tensor token_lora_mapping, - int64_t num_experts, int64_t block_size, int64_t max_loras, - int64_t max_num_tokens_padded, int64_t max_num_m_blocks, - torch::Tensor sorted_token_ids, torch::Tensor expert_ids, - torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled, - torch::Tensor lora_ids) { - const int topk_num = topk_ids.size(1); - - TORCH_CHECK(block_size > 0, "block_size should be greater than 0. 
"); - - int device_max_shared_mem; - auto dev = topk_ids.get_device(); - cudaDeviceGetAttribute(&device_max_shared_mem, - cudaDevAttrMaxSharedMemoryPerBlockOptin, dev); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - - const int32_t num_thread = max((int32_t)num_experts, 128); // WARP_SIZE, - TORCH_CHECK(num_thread <= 1024, - "num_thread must be less than 1024, " - "and fallback is not implemented yet."); - const int32_t shared_mem = (num_thread + 1) * num_experts * sizeof(int32_t) + - (num_experts + 1) * sizeof(int32_t); - - if (shared_mem > device_max_shared_mem) { - TORCH_CHECK(false, - "Shared memory usage exceeds device limit, and global memory " - "fallback is not implemented yet."); - } - - VLLM_DISPATCH_INTEGRAL_TYPES( - topk_ids.scalar_type(), "moe_lora_align_sum_kernel", [&] { - dim3 blockDim(num_thread); - auto kernel = moe_lora_align_sum_kernel; - AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( - (void*)kernel, shared_mem)); - kernel<<>>( - topk_ids.data_ptr(), - token_lora_mapping.data_ptr(), block_size, num_experts, - max_loras, topk_ids.numel(), max_num_tokens_padded, - max_num_m_blocks, sorted_token_ids.data_ptr(), - expert_ids.data_ptr(), topk_num, - num_tokens_post_pad.data_ptr(), - adapter_enabled.data_ptr(), lora_ids.data_ptr()); - }); -} \ No newline at end of file diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h index 4c7accf03440a..337dcc50b079e 100644 --- a/csrc/moe/moe_ops.h +++ b/csrc/moe/moe_ops.h @@ -27,7 +27,7 @@ void moe_lora_align_block_size( int64_t max_num_tokens_padded, int64_t max_num_m_blocks, torch::Tensor sorted_token_ids, torch::Tensor expert_ids, torch::Tensor num_tokens_post_pad, torch::Tensor adapter_enabled, - torch::Tensor lora_ids); + torch::Tensor lora_ids, std::optional maybe_expert_map); #ifndef USE_ROCM torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, torch::Tensor b_qweight, torch::Tensor b_scales, diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp index fca57c31caf8e..779ad70ad1e09 100644 --- a/csrc/moe/torch_bindings.cpp +++ b/csrc/moe/torch_bindings.cpp @@ -47,7 +47,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { " Tensor !experts_ids," " Tensor !num_tokens_post_pad," " Tensor !adapter_enabled," - " Tensor !lora_ids) -> () "); + " Tensor !lora_ids," + " Tensor? 
maybe_expert_map) -> () "); m.impl("moe_lora_align_block_size", torch::kCUDA, &moe_lora_align_block_size); #ifndef USE_ROCM diff --git a/csrc/ops.h b/csrc/ops.h index 5fce3a1a3fea3..37e3aaf7499d5 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -262,7 +262,8 @@ void get_cutlass_moe_mm_data( void get_cutlass_moe_mm_problem_sizes( const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, - const int64_t k, const std::optional& blockscale_offsets); + const int64_t k, const std::optional& blockscale_offsets, + std::optional force_swap_ab = std::nullopt); void get_cutlass_pplx_moe_mm_data(torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, diff --git a/csrc/quantization/cutlass_w4a8/get_group_starts.cuh b/csrc/quantization/cutlass_w4a8/get_group_starts.cuh new file mode 100644 index 0000000000000..fec142d0d87a1 --- /dev/null +++ b/csrc/quantization/cutlass_w4a8/get_group_starts.cuh @@ -0,0 +1,104 @@ +// see csrc/quantization/w8a8/cutlass/moe/get_group_starts.cuh +#pragma once + +#include +#include +#include + +#include "core/scalar_type.hpp" +#include "cutlass/bfloat16.h" +#include "cutlass/float8.h" + +// ElementB is int32 (packed int4) +// ElementGroupScale is cutlass::Array (packed fp8) +template +__global__ void get_group_gemm_starts( + int64_t* expert_offsets, ElementA** a_offsets, ElementB** b_offsets, + ElementC** out_offsets, ElementAccumulator** a_scales_offsets, + ElementAccumulator** b_scales_offsets, + ElementGroupScale** b_group_scales_offsets, ElementA* a_base_as_int, + ElementB* b_base_as_int, ElementC* out_base_as_int, + ElementAccumulator* a_scales_base_as_int, + ElementAccumulator* b_scales_base_as_int, + ElementGroupScale* b_group_scales_base_as_int, int64_t n, int64_t k, + int64_t scale_k) { + int expert_id = threadIdx.x; + + int64_t expert_offset = expert_offsets[expert_id]; + + // same as w8a8 + a_offsets[expert_id] = a_base_as_int + expert_offset * k; + out_offsets[expert_id] = out_base_as_int + expert_offset * n; + a_scales_offsets[expert_id] = a_scales_base_as_int + expert_offset; + b_scales_offsets[expert_id] = b_scales_base_as_int + (n * expert_id); + + // w4a8 specific + constexpr int pack_factor = 8; // pack 8 int4 into int32 + b_offsets[expert_id] = b_base_as_int + (expert_id * k * n / pack_factor); + b_group_scales_offsets[expert_id] = + b_group_scales_base_as_int + (expert_id * scale_k * n); +} + +#define __CALL_GET_STARTS_KERNEL(TENSOR_C_TYPE, C_TYPE) \ + else if (out_tensors.dtype() == TENSOR_C_TYPE) { \ + get_group_gemm_starts> \ + <<<1, num_experts, 0, stream>>>( \ + static_cast(expert_offsets.data_ptr()), \ + static_cast(a_ptrs.data_ptr()), \ + static_cast(b_ptrs.data_ptr()), \ + static_cast(out_ptrs.data_ptr()), \ + static_cast(a_scales_ptrs.data_ptr()), \ + static_cast(b_scales_ptrs.data_ptr()), \ + static_cast**>( \ + b_group_scales_ptrs.data_ptr()), \ + static_cast(a_tensors.data_ptr()), \ + static_cast(b_tensors.data_ptr()), \ + static_cast(out_tensors.data_ptr()), \ + static_cast(a_scales.data_ptr()), \ + static_cast(b_scales.data_ptr()), \ + static_cast*>( \ + b_group_scales.data_ptr()), \ + n, k, scale_k); \ + } + +namespace { + +void run_get_group_gemm_starts( + torch::Tensor const& expert_offsets, torch::Tensor& a_ptrs, + torch::Tensor& b_ptrs, torch::Tensor& out_ptrs, + torch::Tensor& a_scales_ptrs, torch::Tensor& b_scales_ptrs, + torch::Tensor& b_group_scales_ptrs, torch::Tensor const& a_tensors, + torch::Tensor const& b_tensors, torch::Tensor& out_tensors, + 
torch::Tensor const& a_scales, torch::Tensor const& b_scales, + torch::Tensor const& b_group_scales, const int64_t b_group_size) { + TORCH_CHECK(a_tensors.dtype() == torch::kFloat8_e4m3fn); + TORCH_CHECK(b_tensors.dtype() == torch::kInt32); // int4 8x packed into int32 + TORCH_CHECK(a_scales.dtype() == torch::kFloat32); + TORCH_CHECK(b_scales.dtype() == torch::kFloat32); + TORCH_CHECK(b_group_scales.dtype() == + torch::kFloat8_e4m3fn); // the underlying torch type is e4m3 + TORCH_CHECK(out_tensors.dtype() == + torch::kBFloat16); // only support bf16 for now + // expect int64_t to avoid overflow during offset calculations + TORCH_CHECK(expert_offsets.dtype() == torch::kInt64); + + int num_experts = static_cast(expert_offsets.size(0)); + // logical k, n + int64_t n = out_tensors.size(1); + int64_t k = a_tensors.size(1); + int64_t scale_k = cutlass::ceil_div(k, b_group_size); + + auto stream = at::cuda::getCurrentCUDAStream(a_tensors.device().index()); + + if (false) { + } + __CALL_GET_STARTS_KERNEL(torch::kBFloat16, cutlass::bfloat16_t) + __CALL_GET_STARTS_KERNEL(torch::kFloat16, half) + else { + TORCH_CHECK(false, "Invalid output type (must be float16 or bfloat16)"); + } +} + +} // namespace \ No newline at end of file diff --git a/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu b/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu new file mode 100644 index 0000000000000..4b425790dbac7 --- /dev/null +++ b/csrc/quantization/cutlass_w4a8/w4a8_grouped_mm_entry.cu @@ -0,0 +1,483 @@ +#include +#include + +#include "cutlass/cutlass.h" + +#include "cute/tensor.hpp" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/group_array_problem_shape.hpp" +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" + +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/mixed_dtype_utils.hpp" + +// vllm includes +#include +#include +#include +#include "cutlass_extensions/torch_utils.hpp" +#include "cutlass_extensions/common.hpp" + +#include "core/registration.h" +#include "get_group_starts.cuh" +#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp" +#include "w4a8_utils.cuh" + +namespace vllm::cutlass_w4a8_moe { + +using namespace cute; + +// ------------------------------------------------------------------------------------- +// Static configuration shared across all instantiations +// ------------------------------------------------------------------------------------- +using ProblemShape = + cutlass::gemm::GroupProblemShape>; // per + // group +using MmaType = cutlass::float_e4m3_t; +using QuantType = cutlass::int4b_t; + +constexpr int TileShapeK = 128 * 8 / sizeof_bits::value; +static int constexpr PackFactor = 8; // 8 int4 packed into int32 + +// A matrix configuration +using ElementA = MmaType; +using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand +constexpr int AlignmentA = + 128 / + cutlass::sizeof_bits::value; // Alignment of A matrix in units of + // elements (up to 16 bytes) + +// B matrix configuration +using ElementB = QuantType; // Element type for B matrix operand +using LayoutB = + cutlass::layout::ColumnMajor; // Layout type for B matrix operand +constexpr int AlignmentB = + 128 / cutlass::sizeof_bits< + ElementB>::value; // Memory access granularity/alignment of B + // matrix in units of elements (up to 16 bytes) + +// This example manually swaps and transposes, so keep transpose of input 
+// layouts +using LayoutA_Transpose = + typename cutlass::layout::LayoutTranspose::type; +using LayoutB_Transpose = + typename cutlass::layout::LayoutTranspose::type; + +// Need to pass a pointer type to make the 3rd dimension of Stride be _0 +using StrideA = + cute::remove_pointer_t>; +using StrideB = + cute::remove_pointer_t>; + +// Define the CuTe layout for reoredered quantized tensor B +// LayoutAtomQuant places values that will be read by the same thread in +// contiguous locations in global memory. It specifies the reordering within a +// single warp's fragment +using LayoutAtomQuant = + decltype(cutlass::compute_memory_reordering_atom()); +using LayoutB_Reordered = decltype(cute::tile_to_shape( + LayoutAtomQuant{}, Layout>, StrideB>{})); + +using ElementScale = cutlass::float_e4m3_t; +using LayoutScale = cutlass::layout::RowMajor; + +// C/D matrix configuration +using ElementC = + cutlass::bfloat16_t; // Element type for C and D matrix operands +using LayoutC = + cutlass::layout::RowMajor; // Layout type for C and D matrix operands +constexpr int AlignmentC = + 128 / cutlass::sizeof_bits< + ElementC>::value; // Memory access granularity/alignment of C + // matrix in units of elements (up to 16 bytes) + +// D matrix configuration +using ElementD = ElementC; +using LayoutD = LayoutC; +constexpr int AlignmentD = 128 / cutlass::sizeof_bits::value; + +// Core kernel configurations +using ElementAccumulator = float; // Element type for internal accumulation +using ArchTag = cutlass::arch::Sm90; // Tag indicating the minimum SM that + // supports the intended feature +using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag +using StageCountType = + cutlass::gemm::collective::StageCountAuto; // Stage count maximized based + // on the tile size + +// per-channel and per-token scales for epilogue +using ElementSChannel = float; + +template +struct W4A8GroupedGemmKernel { + using TileShape = + decltype(cute::append(TileShape_MN{}, cute::Int{})); + using ClusterShape = ClusterShape_MNK; + + // per-channel, per-token scales epilogue + using ChTokScalesEpilogue = + typename vllm::c3x::ScaledEpilogueArray; + using EVTCompute = typename ChTokScalesEpilogue::EVTCompute; + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder< + ArchTag, OperatorClass, TileShape, ClusterShape, + cutlass::epilogue::collective::EpilogueTileAuto, ElementAccumulator, + ElementSChannel, ElementC, + typename cutlass::layout::LayoutTranspose::type*, AlignmentC, + ElementD, typename cutlass::layout::LayoutTranspose::type*, + AlignmentD, EpilogueSchedule, EVTCompute>::CollectiveOp; + + // =========================================================== MIXED INPUT + // WITH SCALES + // =========================================================================== + // The Scale information must get paired with the operand that will be scaled. + // In this example, B is scaled so we make a tuple of B's information and the + // scale information. 
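(Editorial aside on the SwapAB trick referenced in the comments above: swapping the operands and transposing the layouts computes the same product, since D = A*B implies D^T = B^T * A^T, which lets the quantized operand take the "A" slot of the mixed-input mainloop. A minimal standalone check in plain C++, illustrative only and independent of CUTLASS:)

#include <array>
#include <cassert>

int main() {
  // 2x3 (A) times 3x2 (B), row-major storage
  std::array<int, 6> A = {1, 2, 3, 4, 5, 6};
  std::array<int, 6> B = {7, 8, 9, 10, 11, 12};
  std::array<int, 4> C{}, Ct{};
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j)
      for (int k = 0; k < 3; ++k) C[i * 2 + j] += A[i * 3 + k] * B[k * 2 + j];
  // Swapped/transposed product: (B^T * A^T)[j][i] must equal C[i][j]
  for (int j = 0; j < 2; ++j)
    for (int i = 0; i < 2; ++i)
      for (int k = 0; k < 3; ++k) Ct[j * 2 + i] += B[k * 2 + j] * A[i * 3 + k];
  for (int i = 0; i < 2; ++i)
    for (int j = 0; j < 2; ++j) assert(C[i * 2 + j] == Ct[j * 2 + i]);
  return 0;
}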
+ using CollectiveMainloopShuffled = + typename cutlass::gemm::collective::CollectiveBuilder< + ArchTag, OperatorClass, + cute::tuple>, + LayoutB_Reordered*, AlignmentB, ElementA, LayoutA_Transpose*, + AlignmentA, ElementAccumulator, TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + KernelSchedule>::CollectiveOp; + + using GemmKernelShuffled = cutlass::gemm::kernel::GemmUniversal< + ProblemShape, CollectiveMainloopShuffled, CollectiveEpilogue>; + + using GemmShuffled = + cutlass::gemm::device::GemmUniversalAdapter; + + using StrideC = typename GemmKernelShuffled::InternalStrideC; + using StrideD = typename GemmKernelShuffled::InternalStrideD; + + using StrideC_ref = cutlass::detail::TagToStrideC_t; + using StrideD_ref = cutlass::detail::TagToStrideC_t; + using StrideS = typename CollectiveMainloopShuffled::StrideScale; + using StrideS_ref = cutlass::detail::TagToStrideB_t; + + // static asserts for passing in strides/layouts + // pack to 2x int64 + static_assert(sizeof(StrideS) == 2 * sizeof(int64_t)); + // pack to 3xint32, + static_assert(sizeof(LayoutB_Reordered) % sizeof(int32_t) == 0, + "LayoutB_Reordered size must be divisible by 4 bytes"); + + static void grouped_mm( + torch::Tensor& out_tensors, const torch::Tensor& a_tensors, + const torch::Tensor& b_tensors, const torch::Tensor& a_scales, + const torch::Tensor& b_scales, const torch::Tensor& b_group_scales, + const int64_t b_group_size, const torch::Tensor& expert_offsets, + const torch::Tensor& problem_sizes_torch, const torch::Tensor& a_strides, + const torch::Tensor& b_strides, const torch::Tensor& c_strides, + const torch::Tensor& group_scale_strides) { + auto device = a_tensors.device(); + auto device_id = device.index(); + const at::cuda::OptionalCUDAGuard device_guard(device); + auto stream = at::cuda::getCurrentCUDAStream(device_id); + + int num_experts = static_cast(expert_offsets.size(0)); + int n = static_cast(b_tensors.size(1)); + int k = static_cast(b_tensors.size(2)) * PackFactor; + + auto options_int = + torch::TensorOptions().dtype(torch::kInt64).device(device); + torch::Tensor a_ptrs = torch::empty(num_experts, options_int); + torch::Tensor b_ptrs = torch::empty(num_experts, options_int); + torch::Tensor out_ptrs = torch::empty(num_experts, options_int); + torch::Tensor a_scales_ptrs = torch::empty(num_experts, options_int); + torch::Tensor b_scales_ptrs = torch::empty(num_experts, options_int); + torch::Tensor b_group_scales_ptrs = torch::empty(num_experts, options_int); + + // get the correct offsets to pass to gemm + run_get_group_gemm_starts(expert_offsets, a_ptrs, b_ptrs, out_ptrs, + a_scales_ptrs, b_scales_ptrs, b_group_scales_ptrs, + a_tensors, b_tensors, out_tensors, a_scales, + b_scales, b_group_scales, b_group_size); + + // construct args + using Args = typename GemmShuffled::Arguments; + using MainloopArguments = typename GemmKernelShuffled::MainloopArguments; + using EpilogueArguments = typename GemmKernelShuffled::EpilogueArguments; + Args arguments; + + ProblemShape::UnderlyingProblemShape* problem_sizes_as_shapes = + static_cast( + problem_sizes_torch.data_ptr()); + ProblemShape prob_shape{num_experts, problem_sizes_as_shapes, nullptr}; + + // SwapAB so B operands come first + MainloopArguments mainloop_arguments{ + static_cast(b_ptrs.data_ptr()), + static_cast(b_strides.data_ptr()), + static_cast(a_ptrs.data_ptr()), + static_cast(a_strides.data_ptr()), + static_cast**>( + b_group_scales_ptrs.data_ptr()), + 
static_cast(group_scale_strides.data_ptr()), + static_cast(b_group_size)}; + + EpilogueArguments epilogue_arguments{ + // since we are doing SwapAB the channel scales comes first, then token + // scales + ChTokScalesEpilogue::prepare_args( // see ScaledEpilogueArray + static_cast( + b_scales_ptrs.data_ptr()), // per-channel + static_cast( + a_scales_ptrs.data_ptr()), // per-token + true, true), + nullptr, // C + static_cast(c_strides.data_ptr()), // C + static_cast(out_ptrs.data_ptr()), // D + static_cast(c_strides.data_ptr()) // D + }; + + static const cutlass::KernelHardwareInfo hw_info{ + device_id, + cutlass::KernelHardwareInfo::query_device_multiprocessor_count( + device_id)}; + + arguments = Args{cutlass::gemm::GemmUniversalMode::kGrouped, prob_shape, + mainloop_arguments, epilogue_arguments, hw_info}; + + // Allocate workspace + size_t workspace_size = GemmShuffled::get_workspace_size(arguments); + torch::Tensor workspace = + torch::empty(workspace_size, + torch::TensorOptions().dtype(torch::kU8).device(device)); + + // Run GEMM + GemmShuffled gemm; + CUTLASS_CHECK(gemm.can_implement(arguments)); + CUTLASS_CHECK(gemm.initialize(arguments, workspace.data_ptr(), stream)); + CUTLASS_CHECK(gemm.run(stream)); + } +}; + +// ---------------------------------------------------------------------------- +// Kernel instantiations and dispatch logic +// ---------------------------------------------------------------------------- +using Coop = cutlass::gemm::KernelPtrArrayTmaWarpSpecializedCooperative; +using CoopEpi = cutlass::epilogue::PtrArrayTmaWarpSpecializedCooperative; + +// Kernel_TileShape_ClusterShape_Schedule +using Kernel_128x16_1x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_1, _1, _1>, Coop, CoopEpi>; +using Kernel_128x16_2x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>; + +using Kernel_256x16_1x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_1, _1, _1>, Coop, CoopEpi>; +using Kernel_256x16_2x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>; + +using Kernel_256x32_1x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_1, _1, _1>, Coop, CoopEpi>; +using Kernel_256x32_2x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>; + +using Kernel_256x64_1x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_1, _1, _1>, Coop, CoopEpi>; +using Kernel_256x64_2x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>; + +using Kernel_256x128_1x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_1, _1, _1>, Coop, CoopEpi>; +using Kernel_256x128_2x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>; + +using Kernel_128x256_2x1x1_Coop = + W4A8GroupedGemmKernel, Shape<_2, _1, _1>, Coop, CoopEpi>; + +void mm_dispatch( + torch::Tensor& out_tensors, const torch::Tensor& a_tensors, + const torch::Tensor& b_tensors, const torch::Tensor& a_scales, + const torch::Tensor& b_scales, const torch::Tensor& b_group_scales, + const int64_t b_group_size, const torch::Tensor& expert_offsets, + const torch::Tensor& problem_sizes, const torch::Tensor& a_strides, + const torch::Tensor& b_strides, const torch::Tensor& c_strides, + const torch::Tensor& group_scale_strides, const std::string& schedule) { + if (schedule == "Kernel_128x16_1x1x1_Coop") { + Kernel_128x16_1x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_128x16_2x1x1_Coop") { + Kernel_128x16_2x1x1_Coop::grouped_mm( + 
out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_256x16_1x1x1_Coop") { + Kernel_256x16_1x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_256x16_2x1x1_Coop") { + Kernel_256x16_2x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_256x32_1x1x1_Coop") { + Kernel_256x32_1x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_256x32_2x1x1_Coop") { + Kernel_256x32_2x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_256x64_1x1x1_Coop") { + Kernel_256x64_1x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_256x64_2x1x1_Coop") { + Kernel_256x64_2x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_256x128_1x1x1_Coop") { + Kernel_256x128_1x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_256x128_2x1x1_Coop") { + Kernel_256x128_2x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else if (schedule == "Kernel_128x256_2x1x1_Coop") { + Kernel_128x256_2x1x1_Coop::grouped_mm( + out_tensors, a_tensors, b_tensors, a_scales, b_scales, b_group_scales, + b_group_size, expert_offsets, problem_sizes, a_strides, b_strides, + c_strides, group_scale_strides); + } else { + TORCH_CHECK(false, + "cutlass_w4a8_moe_mm: unknown schedule string: ", schedule); + } +} + +void mm(torch::Tensor& out_tensors, const torch::Tensor& a_tensors, + const torch::Tensor& b_tensors, const torch::Tensor& a_scales, + const torch::Tensor& b_scales, const torch::Tensor& b_group_scales, + const int64_t b_group_size, const torch::Tensor& expert_offsets, + const torch::Tensor& problem_sizes, const torch::Tensor& a_strides, + const torch::Tensor& b_strides, const torch::Tensor& c_strides, + const torch::Tensor& group_scale_strides, + std::optional maybe_schedule) { + // user has specified a schedule + if (maybe_schedule) { + mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales, + b_group_scales, b_group_size, expert_offsets, problem_sizes, + a_strides, b_strides, c_strides, group_scale_strides, + *maybe_schedule); + return; + } + + // use heuristic + int m_full = a_tensors.size(0); 
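  // (Editorial note, illustrative values.) Worked example of the heuristic
  // below: with m_full = 4096 tokens routed across num_experts = 32, the
  // assumed-uniform per-expert batch is m_expert = 128, which selects
  // "Kernel_256x128_2x1x1_Coop"; m_full = 256 with the same 32 experts gives
  // m_expert = 8 and selects "Kernel_128x16_2x1x1_Coop".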
+ int n = b_tensors.size(1); + int k = b_tensors.size(2) * PackFactor; // logical k + int num_experts = b_tensors.size(0); + // per-expert batch size assuming uniform distribution + int m_expert = m_full / num_experts; + + std::string schedule; + if (m_expert <= 16) { + schedule = "Kernel_128x16_2x1x1_Coop"; + } else if (m_expert <= 32) { + schedule = "Kernel_256x32_1x1x1_Coop"; + } else if (m_expert <= 64) { + schedule = "Kernel_256x64_1x1x1_Coop"; + } else if (m_expert <= 128) { + schedule = "Kernel_256x128_2x1x1_Coop"; + } else { // m_expert > 128 + schedule = "Kernel_128x256_2x1x1_Coop"; + } + + mm_dispatch(out_tensors, a_tensors, b_tensors, a_scales, b_scales, + b_group_scales, b_group_size, expert_offsets, problem_sizes, + a_strides, b_strides, c_strides, group_scale_strides, schedule); +} + +std::tuple encode_and_reorder_int4b( + torch::Tensor const& b_tensors) { + TORCH_CHECK(b_tensors.dtype() == torch::kInt32); + TORCH_CHECK(b_tensors.dim() == 3); // (experts, n, k) + TORCH_CHECK(b_tensors.is_contiguous()); + TORCH_CHECK(b_tensors.is_cuda()); + + int n = static_cast(b_tensors.size(1)); + int k = static_cast(b_tensors.size(2)) * PackFactor; // logical k + + // CUTLASS reorder_tensor requires k % 256 == 0 and n % 16 == 0. + // These misalignments cause silent OOB unless run under Compute Sanitizer. + TORCH_CHECK(k % 256 == 0, "logical k must be divisible by 256"); + TORCH_CHECK(n % 16 == 0, "n must be divisible by 16"); + + // we will store the layout to an int32 tensor; + // this is the number of elements we need per layout + constexpr size_t layout_width = sizeof(LayoutB_Reordered) / sizeof(int32_t); + + torch::Tensor b_tensors_packed = torch::empty_like(b_tensors); + int num_experts = static_cast(b_tensors.size(0)); + + auto b_ptr = static_cast(b_tensors.const_data_ptr()); + auto b_packed_ptr = static_cast(b_tensors_packed.data_ptr()); + + // multiply by ull so result does not overflow int32 + size_t num_int4_elems = 1ull * num_experts * n * k; + bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(b_ptr, b_packed_ptr, + num_int4_elems); + TORCH_CHECK(ok, "unified_encode_int4b failed"); + + // construct the layout once; assumes each expert has the same layout + using LayoutType = LayoutB_Reordered; + std::vector layout_B_reordered_host(num_experts); + auto stride_B = cutlass::make_cute_packed_stride(StrideB{}, {n, k, Int<1>{}}); + auto shape_B = cute::make_shape(n, k, Int<1>{}); + auto layout_B = make_layout(shape_B, stride_B); + LayoutType layout_B_reordered = tile_to_shape(LayoutAtomQuant{}, shape_B); + + // reorder weights for each expert + for (int i = 0; i < num_experts; i++) { + // since the storage type of int4b is 1 byte but one element is 4 bits + // we need to adjust the offset + int64_t offset = + 1ull * i * n * k * cutlass::sizeof_bits::value / 8; + cutlass::reorder_tensor(b_packed_ptr + offset, layout_B, + layout_B_reordered); + } + + // save the packed layout to torch tensor so we can re-use it + auto cpu_opts = + torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU); + torch::Tensor layout_cpu = + torch::empty({num_experts, layout_width}, cpu_opts); + + int32_t* layout_data = layout_cpu.data_ptr(); + for (int i = 0; i < num_experts; ++i) { + std::memcpy(layout_data + i * layout_width, // dst (int32*) + &layout_B_reordered, // src (LayoutType*) + sizeof(LayoutType)); // number of bytes + } + + torch::Tensor packed_layout = + layout_cpu.to(b_tensors.device(), /*non_blocking=*/false); + + return {b_tensors_packed, packed_layout}; +} + 
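(Editorial sketch of the per-expert addressing used in the reorder loop above; shapes are assumed for illustration and satisfy the k % 256 == 0, n % 16 == 0 checks. Each expert owns n*k int4 values, i.e. n*k/2 bytes, and reorder_tensor is applied expert by expert at that byte offset:)

#include <cassert>
#include <cstdint>

int main() {
  const int64_t num_experts = 4, n = 32, k = 512;        // assumed shapes
  const int64_t int4_per_expert = n * k;                 // logical int4 values
  const int64_t bytes_per_expert = int4_per_expert / 2;  // two int4 per byte
  const int64_t int32_per_expert = int4_per_expert / 8;  // pack factor of 8
  assert(int32_per_expert * 4 == bytes_per_expert);      // int32 view covers the same bytes
  for (int64_t i = 0; i < num_experts; ++i) {
    // Matches the offset used above: i * n * k * sizeof_bits(int4b_t) / 8
    assert(i * bytes_per_expert == i * n * k * 4 / 8);
  }
  return 0;
}

(The int32 packing and the byte offset describe the same storage, which is why the reorder can walk the packed tensor expert by expert.)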
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { + m.impl("cutlass_w4a8_moe_mm", &mm); + m.impl("cutlass_encode_and_reorder_int4b_grouped", &encode_and_reorder_int4b); +} + +} // namespace vllm::cutlass_w4a8_moe +///////////////////////////////////////////////////////////////////////////////////////////////// \ No newline at end of file diff --git a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu index 2d1568b08651c..f77af06cd6c08 100644 --- a/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu +++ b/csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu @@ -7,6 +7,7 @@ #include #include #include "cutlass_extensions/torch_utils.hpp" +#include "w4a8_utils.cuh" #include "core/registration.h" @@ -395,71 +396,6 @@ torch::Tensor pack_scale_fp8(torch::Tensor const& scales) { return packed_scales; } -/* - GPU-accelerated implementation of cutlass::unified_encode_int4b. - Constructs a lookup table in constant memory to map 8 bits - (two 4-bit values) at a time. Assumes memory is contiguous - and pointers are 16-byte aligned. -*/ -__constant__ uint8_t kNibbleLUT[256]; - -__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out, - size_t nbytes) { - constexpr size_t V = sizeof(uint4); // 16 bytes - const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; - const size_t nthreads = size_t(gridDim.x) * blockDim.x; - const size_t nvec = nbytes / V; - - // 1-D grid-stride loop over 16-byte chunks - for (size_t vec = tid; vec < nvec; vec += nthreads) { - uint4 v = reinterpret_cast(in)[vec]; - uint8_t* b = reinterpret_cast(&v); -#pragma unroll - for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]]; - reinterpret_cast(out)[vec] = v; - } -} - -static bool upload_lut() { - std::array lut{}; - auto map_nib = [](uint8_t v) -> uint8_t { - // 1..7 -> (8 - v); keep 0 and 8..15 - return (v == 0 || (v & 0x8)) ? 
v : uint8_t(8 - v); - }; - for (int b = 0; b < 256; ++b) { - uint8_t lo = b & 0xF; - uint8_t hi = (b >> 4) & 0xF; - lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo)); - } - cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(), - /*offset=*/0, cudaMemcpyHostToDevice); - - return (e == cudaSuccess); -} - -static bool unified_encode_int4b(cutlass::int4b_t const* in, - cutlass::int4b_t* out, size_t num_int4_elems) { - // Build/upload LUT - if (!upload_lut()) return false; - - static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1, - "int4 storage must be 1 byte"); - const size_t nbytes = num_int4_elems >> 1; - - auto* in_bytes = reinterpret_cast(in); - auto* out_bytes = reinterpret_cast(out); - - // kernel launch params - constexpr int block = 256; - const size_t nvec = nbytes / sizeof(uint4); // # of 16B vectors - int grid = int((nvec + block - 1) / block); - if (grid == 0) grid = 1; // ensure we still cover the tail in the kernel - - unified_encode_int4b_device<<>>(in_bytes, out_bytes, nbytes); - cudaError_t err = cudaGetLastError(); - return (err == cudaSuccess); -} - torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) { TORCH_CHECK(B.dtype() == torch::kInt32); TORCH_CHECK(B.dim() == 2); @@ -477,8 +413,8 @@ torch::Tensor encode_and_reorder_int4b(torch::Tensor const& B) { LayoutB_Reordered layout_B_reordered = cute::tile_to_shape(LayoutAtomQuant{}, shape_B); - bool ok = - vllm::cutlass_w4a8::unified_encode_int4b(B_ptr, B_packed_ptr, n * k); + bool ok = vllm::cutlass_w4a8_utils::unified_encode_int4b(B_ptr, B_packed_ptr, + n * k); TORCH_CHECK(ok, "unified_encode_int4b failed"); cutlass::reorder_tensor(B_packed_ptr, layout_B, layout_B_reordered); diff --git a/csrc/quantization/cutlass_w4a8/w4a8_utils.cu b/csrc/quantization/cutlass_w4a8/w4a8_utils.cu new file mode 100644 index 0000000000000..f238d0a5b2d78 --- /dev/null +++ b/csrc/quantization/cutlass_w4a8/w4a8_utils.cu @@ -0,0 +1,90 @@ +#include "w4a8_utils.cuh" + +#include +#include +#include + +namespace vllm::cutlass_w4a8_utils { + +/* + GPU-accelerated implementation of cutlass::unified_encode_int4b. + Constructs a lookup table in constant memory to map 8 bits + (two 4-bit values) at a time. Assumes memory is contiguous + and pointers are 16-byte aligned. +*/ +__constant__ uint8_t kNibbleLUT[256]; + +__global__ void unified_encode_int4b_device(const uint8_t* in, uint8_t* out, + size_t nbytes) { + constexpr size_t V = sizeof(uint4); // 16 bytes + const size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + const size_t nthreads = size_t(gridDim.x) * blockDim.x; + const size_t nvec = nbytes / V; + + // 1-D grid-stride loop over 16-byte chunks + for (size_t vec = tid; vec < nvec; vec += nthreads) { + uint4 v = reinterpret_cast(in)[vec]; + uint8_t* b = reinterpret_cast(&v); +#pragma unroll + for (int i = 0; i < int(V); ++i) b[i] = kNibbleLUT[b[i]]; + reinterpret_cast(out)[vec] = v; + } +} + +static bool upload_lut() { + std::array lut{}; + auto map_nib = [](uint8_t v) -> uint8_t { + // 1..7 -> (8 - v); keep 0 and 8..15 + return (v == 0 || (v & 0x8)) ? 
v : uint8_t(8 - v); + }; + for (int b = 0; b < 256; ++b) { + uint8_t lo = b & 0xF; + uint8_t hi = (b >> 4) & 0xF; + lut[b] = uint8_t((map_nib(hi) << 4) | map_nib(lo)); + } + cudaError_t e = cudaMemcpyToSymbol(kNibbleLUT, lut.data(), lut.size(), + /*offset=*/0, cudaMemcpyHostToDevice); + + return (e == cudaSuccess); +} + +bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out, + size_t num_int4_elems) { + // Build/upload LUT + if (!upload_lut()) return false; + + static_assert(sizeof(typename cutlass::int4b_t::Storage) == 1, + "int4 storage must be 1 byte"); + const size_t nbytes = num_int4_elems >> 1; + + auto* in_bytes = reinterpret_cast(in); + auto* out_bytes = reinterpret_cast(out); + + // kernel launch params + constexpr int block = 256; + const size_t nvec = nbytes / sizeof(uint4); // # of 16B vectors + int grid = int((nvec + block - 1) / block); + if (grid == 0) grid = 1; // ensure we still cover the tail in the kernel + + unified_encode_int4b_device<<>>(in_bytes, out_bytes, nbytes); + + // launch errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("unified_encode_int4b_device launch error: %s (%d)\n", + cudaGetErrorString(err), err); + return false; + } + + // runtime errors + err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { + printf("unified_encode_int4b_device runtime error: %s (%d)\n", + cudaGetErrorString(err), err); + return false; + } + + return true; +} + +} // namespace vllm::cutlass_w4a8_utils \ No newline at end of file diff --git a/csrc/quantization/cutlass_w4a8/w4a8_utils.cuh b/csrc/quantization/cutlass_w4a8/w4a8_utils.cuh new file mode 100644 index 0000000000000..25090091a368d --- /dev/null +++ b/csrc/quantization/cutlass_w4a8/w4a8_utils.cuh @@ -0,0 +1,11 @@ +#pragma once + +#include +#include "cutlass/numeric_types.h" + +namespace vllm::cutlass_w4a8_utils { + +bool unified_encode_int4b(cutlass::int4b_t const* in, cutlass::int4b_t* out, + size_t num_int4_elems); + +} // namespace vllm::cutlass_w4a8_utils \ No newline at end of file diff --git a/csrc/quantization/machete/machete_mainloop.cuh b/csrc/quantization/machete/machete_mainloop.cuh index 2f52a6b7a0246..9f02f4f179741 100644 --- a/csrc/quantization/machete/machete_mainloop.cuh +++ b/csrc/quantization/machete/machete_mainloop.cuh @@ -617,7 +617,7 @@ struct MacheteCollectiveMma { // Same as upstream, should be kept the same when possible, not formatted for // easier comparison - // with `SwapAB ? N : M -> M` since we dont support SwapAB + // with `SwapAB ? 
N : M -> M` since we don't support SwapAB // clang-format off template static bool diff --git a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu index 49cafcc32adc6..99fec8fd6febc 100644 --- a/csrc/quantization/w8a8/cutlass/moe/moe_data.cu +++ b/csrc/quantization/w8a8/cutlass/moe/moe_data.cu @@ -136,15 +136,17 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids, void get_cutlass_moe_mm_problem_sizes_caller( const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, - const int64_t k, const std::optional& blockscale_offsets) { + const int64_t k, const std::optional& blockscale_offsets, + std::optional force_swap_ab = std::nullopt) { auto stream = at::cuda::getCurrentCUDAStream(topk_ids.device().index()); auto options_int32 = torch::TensorOptions().dtype(torch::kInt32).device(topk_ids.device()); torch::Tensor atomic_buffer = torch::zeros(num_experts, options_int32); // Swap-AB should be disabled for FP4 path - bool may_swap_ab = (!blockscale_offsets.has_value()) && - (topk_ids.numel() <= SWAP_AB_THRESHOLD); + bool may_swap_ab = + force_swap_ab.value_or((!blockscale_offsets.has_value()) && + (topk_ids.numel() <= SWAP_AB_THRESHOLD)); launch_compute_problem_sizes(topk_ids, problem_sizes1, problem_sizes2, atomic_buffer, num_experts, n, k, stream, diff --git a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu index c5012a8669317..5de21cfbbaafb 100644 --- a/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu +++ b/csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu @@ -80,7 +80,8 @@ void get_cutlass_moe_mm_data_caller( void get_cutlass_moe_mm_problem_sizes_caller( const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, - const int64_t k, const std::optional& blockscale_offsets); + const int64_t k, const std::optional& blockscale_offsets, + std::optional force_swap_ab = std::nullopt); void get_cutlass_pplx_moe_mm_data_caller(torch::Tensor& expert_offsets, torch::Tensor& problem_sizes1, @@ -303,14 +304,15 @@ void get_cutlass_moe_mm_data( void get_cutlass_moe_mm_problem_sizes( const torch::Tensor& topk_ids, torch::Tensor& problem_sizes1, torch::Tensor& problem_sizes2, const int64_t num_experts, const int64_t n, - const int64_t k, const std::optional& blockscale_offsets) { + const int64_t k, const std::optional& blockscale_offsets, + std::optional force_swap_ab = std::nullopt) { int32_t version_num = get_sm_version_num(); #if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \ (defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \ (defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120) get_cutlass_moe_mm_problem_sizes_caller(topk_ids, problem_sizes1, problem_sizes2, num_experts, n, k, - blockscale_offsets); + blockscale_offsets, force_swap_ab); return; #endif TORCH_CHECK_NOT_IMPLEMENTED( diff --git a/csrc/rocm/skinny_gemms.cu b/csrc/rocm/skinny_gemms.cu index 2ef579a1b7537..8ebe55cef391d 100644 --- a/csrc/rocm/skinny_gemms.cu +++ b/csrc/rocm/skinny_gemms.cu @@ -1241,33 +1241,16 @@ __global__ void wvSplitK_hf_big_(const int K, const int M, const int Bx, } #endif // defined(__HIP__GFX9__) TODO: Add NAVI support +// Find the min val of div2 that doesn't increase N/(div1*div2) int mindiv(int N, int div1, int div2) { int nPrRnd = div1 * div2; - int rnds0 = N / nPrRnd; - nPrRnd -= div1 * 3; - int rnds3 
= N / nPrRnd; - nPrRnd -= div1; - int rnds4 = N / nPrRnd; - nPrRnd -= div1; - int rnds5 = N / nPrRnd; - nPrRnd -= div1; - int rnds6 = N / nPrRnd; - nPrRnd -= div1; - int rnds7 = N / nPrRnd; - nPrRnd -= div1; - int rnds8 = N / nPrRnd; - nPrRnd -= div1; - int rnds9 = N / nPrRnd; - nPrRnd -= div1; - int rtn = div2; - if (rnds0 == rnds3) rtn = div2 - 3; - if (rnds0 == rnds4) rtn = div2 - 4; - if (rnds0 == rnds5) rtn = div2 - 5; - if (rnds0 == rnds6) rtn = div2 - 6; - if (rnds0 == rnds7) rtn = div2 - 7; - if (rnds0 == rnds8) rtn = div2 - 8; - if (rnds0 == rnds9) rtn = div2 - 9; - return rtn; + int rnds[13]; + for (int i = 0; i < 13; i++) { + rnds[i] = (N + nPrRnd - 1) / nPrRnd; + nPrRnd -= div1; + } + for (int i = 12; i >= 0; i--) + if (rnds[0] == rnds[i]) return (div2 - i); } torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b, @@ -1300,26 +1283,37 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b, const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); const int max_lds_len = get_lds_size() / 2; -#define WVSPLITK(_WvPrGrp, _YTILEs, _YTILEm, _YTILEb, _UNRLs, _UNRLm, _UNRLb, \ - _N) \ - { \ - dim3 block(64, _WvPrGrp); \ - if ((K_in * N_in <= max_lds_len) && (M_in % _YTILEs == 0)) { \ - int __wvPrGrp = mindiv(M_in, CuCount * _YTILEs, _WvPrGrp); \ - wvSplitK_hf_sml_ \ - <<>>(K_in, M_in, Bx_in, By_in, af4, bf4, \ - biasf4, c, __wvPrGrp, CuCount); \ - } else if (K_in * N_in <= max_lds_len * 1.2) { \ - int __wvPrGrp = mindiv(M_in, CuCount * _YTILEm, _WvPrGrp); \ - wvSplitK_hf_ \ - <<>>(K_in, M_in, Bx_in, By_in, af4, bf4, \ - biasf4, c, __wvPrGrp, CuCount); \ - } else { \ - int __wvPrGrp = mindiv(M_in, CuCount * _YTILEb, _WvPrGrp); \ - wvSplitK_hf_big_ \ - <<>>(K_in, M_in, Bx_in, By_in, af4, bf4, \ - biasf4, c, __wvPrGrp, CuCount); \ - } \ +#define WVSPLITK(_YTILE, _UNRL, _N) \ + { \ + dim3 block(64, 16); \ + int __wvPrGrp = mindiv(M_in, CuCount * _YTILE, 16); \ + if ((K_in * N_in <= max_lds_len) && (M_in % _YTILE == 0)) \ + wvSplitK_hf_sml_ \ + <<>>(K_in, M_in, Bx_in, By_in, af4, bf4, \ + biasf4, c, __wvPrGrp, CuCount); \ + else if (K_in * N_in <= max_lds_len * 1.2) \ + wvSplitK_hf_ \ + <<>>(K_in, M_in, Bx_in, By_in, af4, bf4, \ + biasf4, c, __wvPrGrp, CuCount); \ + else \ + wvSplitK_hf_big_ \ + <<>>(K_in, M_in, Bx_in, By_in, af4, bf4, \ + biasf4, c, __wvPrGrp, CuCount); \ + } + +#define WVSPLIT_TILE(_sYT, __N) \ + { \ + bool fit_lds = (K_in * N_in <= max_lds_len); \ + if (_sYT <= 1) \ + WVSPLITK(1, 4, __N) \ + else if ((__N == 1) || (!fit_lds) || (_sYT <= 4 * 2)) \ + WVSPLITK(2, 2, __N) \ + else if (_sYT <= 4 * 3) \ + WVSPLITK(3, 2, __N) \ + else if (__N == 4) \ + WVSPLITK(4, 1, __N) \ + else \ + WVSPLITK(4, 2, __N) \ } AT_DISPATCH_REDUCED_FLOATING_TYPES(in_b.scalar_type(), "wvSplitK", [&] { @@ -1331,18 +1325,23 @@ torch::Tensor wvSplitK(const at::Tensor& in_a, const at::Tensor& in_b, ? reinterpret_cast(in_bias->data_ptr()) : nullptr; fptype* c = reinterpret_cast(out_c.data_ptr()); + + // first shoot for biggest tile-size that keeps all simd busy, + // then cut the active waves to balance their distribution... 
+ int sYT = (M_in + CuCount * 4 - 1) / (CuCount * 4); + switch (N_in) { case 1: - WVSPLITK(16, 2, 2, 2, 2, 2, 2, 1) + WVSPLIT_TILE(sYT, 1) break; case 2: - WVSPLITK(16, 2, 2, 2, 2, 2, 2, 2) + WVSPLIT_TILE(sYT, 2) break; case 3: - WVSPLITK(16, 4, 7, 7, 1, 1, 1, 3) + WVSPLIT_TILE(sYT, 3) break; case 4: - WVSPLITK(16, 4, 7, 7, 1, 1, 1, 4) + WVSPLIT_TILE(sYT, 4) break; default: throw std::runtime_error( diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 62212f98b4766..d4c6f8c67c516 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -350,6 +350,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("cutlass_encode_and_reorder_int4b(Tensor B) -> Tensor"); // conditionally compiled so impl registration is in source file + // CUTLASS w4a8 grouped GEMM + ops.def( + "cutlass_w4a8_moe_mm(" + " Tensor! out_tensors," + " Tensor a_tensors," + " Tensor b_tensors," + " Tensor a_scales," + " Tensor b_scales," + " Tensor b_group_scales," + " int b_group_size," + " Tensor expert_offsets," + " Tensor problem_sizes," + " Tensor a_strides," + " Tensor b_strides," + " Tensor c_strides," + " Tensor group_scale_strides," + " str? maybe_schedule" + ") -> ()"); + ops.def( + "cutlass_encode_and_reorder_int4b_grouped(Tensor b_tensors) -> (Tensor, " + "Tensor)"); + // conditionally compiled so impl registration is in source file + #endif // Dequantization for GGML. @@ -466,7 +489,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { " Tensor! problem_sizes1, " " Tensor! problem_sizes2, " " int num_experts, int n, int k, " - " Tensor? blockscale_offsets) -> ()"); + " Tensor? blockscale_offsets, " + " bool? force_swap_ab) -> ()"); ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA, &get_cutlass_moe_mm_problem_sizes); diff --git a/docs/api/README.md b/docs/api/README.md index d3a141f327308..d51329ec2faa3 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -15,6 +15,7 @@ API documentation for vLLM's configuration classes. - [vllm.config.MultiModalConfig][] - [vllm.config.PoolerConfig][] - [vllm.config.StructuredOutputsConfig][] +- [vllm.config.ProfilerConfig][] - [vllm.config.ObservabilityConfig][] - [vllm.config.KVTransferConfig][] - [vllm.config.CompilationConfig][] diff --git a/docs/benchmarking/cli.md b/docs/benchmarking/cli.md index 1ce6b611745b1..dd5a12e408b02 100644 --- a/docs/benchmarking/cli.md +++ b/docs/benchmarking/cli.md @@ -84,7 +84,7 @@ Total input tokens: 1369 Total generated tokens: 2212 Request throughput (req/s): 1.73 Output token throughput (tok/s): 382.89 -Total Token throughput (tok/s): 619.85 +Total token throughput (tok/s): 619.85 ---------------Time to First Token---------------- Mean TTFT (ms): 71.54 Median TTFT (ms): 73.88 diff --git a/docs/contributing/profiling.md b/docs/contributing/profiling.md index 65382afbe4f21..cbce14ce992ec 100644 --- a/docs/contributing/profiling.md +++ b/docs/contributing/profiling.md @@ -5,16 +5,15 @@ ## Profile with PyTorch Profiler -We support tracing vLLM workers using the `torch.profiler` module. You can enable tracing by setting the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where you want to save the traces: `VLLM_TORCH_PROFILER_DIR=/mnt/traces/`. Additionally, you can control the profiling content by specifying the following environment variables: +We support tracing vLLM workers using the `torch.profiler` module. 
You can enable the torch profiler by setting `--profiler-config` +when launching the server, and setting the entries `profiler` to `'torch'` and `torch_profiler_dir` to the directory where you want to save the traces. Additionally, you can control the profiling content by specifying the following additional arguments in the config: -- `VLLM_TORCH_PROFILER_RECORD_SHAPES=1` to enable recording Tensor Shapes, off by default -- `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1` to record memory, off by default -- `VLLM_TORCH_PROFILER_WITH_STACK=1` to enable recording stack information, on by default -- `VLLM_TORCH_PROFILER_WITH_FLOPS=1` to enable recording FLOPs, off by default -- `VLLM_TORCH_PROFILER_USE_GZIP=0` to disable gzip-compressing profiling files, on by default -- `VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0` to disable dumping and printing the aggregated CUDA self time table, on by default - -The OpenAI server also needs to be started with the `VLLM_TORCH_PROFILER_DIR` environment variable set. +- `torch_profiler_record_shapes` to enable recording Tensor Shapes, off by default +- `torch_profiler_with_memory` to record memory, off by default +- `torch_profiler_with_stack` to enable recording stack information, on by default +- `torch_profiler_with_flops` to enable recording FLOPs, off by default +- `torch_profiler_use_gzip` to control gzip-compressing profiling files, on by default +- `torch_profiler_dump_cuda_time_total` to control dumping and printing the aggregated CUDA self time table, on by default When using `vllm bench serve`, you can enable profiling by passing the `--profile` flag. @@ -40,8 +39,7 @@ Refer to [examples/offline_inference/simple_profiling.py](../../examples/offline #### OpenAI Server ```bash -VLLM_TORCH_PROFILER_DIR=./vllm_profile \ - vllm serve meta-llama/Llama-3.1-8B-Instruct +vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config '{"profiler": "torch", "torch_profiler_dir": "./vllm_profile"}' ``` vllm bench command: @@ -104,13 +102,12 @@ To profile the server, you will want to prepend your `vllm serve` command with ` ```bash # server -VLLM_TORCH_CUDA_PROFILE=1 \ nsys profile \ --trace-fork-before-exec=true \ --cuda-graph-trace=node \ --capture-range=cudaProfilerApi \ --capture-range-end repeat \ - vllm serve meta-llama/Llama-3.1-8B-Instruct + vllm serve meta-llama/Llama-3.1-8B-Instruct --profiler-config.profiler cuda # client vllm bench serve \ diff --git a/docs/design/metrics.md b/docs/design/metrics.md index 28b5405871ac2..2722e12fdaeaf 100644 --- a/docs/design/metrics.md +++ b/docs/design/metrics.md @@ -21,30 +21,20 @@ The mental model is that server-level metrics help explain the values of request ### v1 Metrics -In v1, the following metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix: +In v1, an extensive set of metrics are exposed via a Prometheus-compatible `/metrics` endpoint using the `vllm:` prefix, for example: - `vllm:num_requests_running` (Gauge) - Number of requests currently running. -- `vllm:num_requests_waiting` (Gauge) - Number of requests currently waiting. - `vllm:kv_cache_usage_perc` (Gauge) - Fraction of used KV cache blocks (0–1). - `vllm:prefix_cache_queries` (Counter) - Number of prefix cache queries. - `vllm:prefix_cache_hits` (Counter) - Number of prefix cache hits. -- `vllm:mm_cache_queries` (Counter) - (For multimodal models) Number of multimodal cache queries. -- `vllm:mm_cache_hits` (Counter) - (For multimodal models) Number of multimodal cache hits. 
-- `vllm:num_preemptions_total` (Counter) - Number of preemptions. - `vllm:prompt_tokens_total` (Counter) - Total number of prompt tokens processed. - `vllm:generation_tokens_total` (Counter) - Total number of generated tokens. -- `vllm:iteration_tokens_total` (Histogram) - Histogram of tokens processed in each engine step. -- `vllm:cache_config_info` (Gauge) - Information about the cache configuration. - `vllm:request_success_total` (Counter) - Number of finished requests (by finish reason). - `vllm:request_prompt_tokens` (Histogram) - Histogram of input prompt token counts. - `vllm:request_generation_tokens` (Histogram) - Histogram of generation token counts. -- `vllm:request_params_n` (Histogram) - Histogram of request parameter n. -- `vllm:request_params_max_tokens` - (Histogram) - Histogram of max_tokens parameter in requests. - `vllm:time_to_first_token_seconds` (Histogram) - Time to first token (TTFT). - `vllm:inter_token_latency_seconds` (Histogram) - Inter-token latency. - `vllm:e2e_request_latency_seconds` (Histogram) - End-to-end request latency. -- `vllm:request_queue_time_seconds` (Histogram) - Time spent in the queue. -- `vllm:request_inference_time_seconds` (Histogram) - Request inference time. - `vllm:request_prefill_time_seconds` (Histogram) - Request prefill time. - `vllm:request_decode_time_seconds` (Histogram) - Request decode time. diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index 3485c40c36811..b0ca2dad23d5b 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -152,5 +152,5 @@ The interface for the model/module may change during vLLM's development. If you ## Deprecation announcement !!! warning "Deprecations" - - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It will be removed in v0.13.0 or v1.0.0. - - `_Backend` in `vllm.attention` is deprecated. It will be removed in v0.13.0 or v1.0.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead. + - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0. + - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead. 
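The trimmed metrics list above still only names the series; to see what a running server actually emits on the Prometheus-compatible `/metrics` endpoint, a quick scrape is enough. A minimal sketch, assuming a `vllm serve` instance listening on the default `localhost:8000`; the port and grep patterns are illustrative and not part of this diff:

```bash
# List the vllm-prefixed series currently exposed by the running server
curl -s http://localhost:8000/metrics | grep '^vllm:' | head -n 20

# Pull the two prefix-cache counters listed above to eyeball the hit rate
curl -s http://localhost:8000/metrics | grep -E '^vllm:prefix_cache_(queries|hits)'
```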
diff --git a/docs/features/README.md b/docs/features/README.md index 684802301a44f..e9e5232929b72 100644 --- a/docs/features/README.md +++ b/docs/features/README.md @@ -68,8 +68,8 @@ th:not(:first-child) { | CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | [❌](https://github.com/vllm-project/vllm/issues/26970) | | [pooling](../models/pooling_models.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | enc-dec | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | -| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | [🟠](https://github.com/vllm-project/vllm/issues/26965) | -| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | +| [mm](multimodal_inputs.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| [prompt-embeds](prompt_embeds.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | prmpt logP | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | async output | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | diff --git a/docs/features/disagg_encoder.md b/docs/features/disagg_encoder.md index 7d40af7069822..f18a0e85e4b3b 100644 --- a/docs/features/disagg_encoder.md +++ b/docs/features/disagg_encoder.md @@ -32,14 +32,14 @@ Design doc: bool: return self.end_token_id in input_ids + + def is_reasoning_end_streaming(self, input_ids: list[int], delta_ids: list[int]) -> bool: + return self.end_token_id in delta_token_ids ... ``` diff --git a/docs/features/structured_outputs.md b/docs/features/structured_outputs.md index 7d52891bea7b9..3ac987559e622 100644 --- a/docs/features/structured_outputs.md +++ b/docs/features/structured_outputs.md @@ -61,7 +61,7 @@ Now let´s see an example for each of the cases, starting with the `choice`, as print(completion.choices[0].message.content) ``` -The next example shows how to use the `regex`. The idea is to generate an email address, given a simple regex template: +The next example shows how to use the `regex`. The supported regex syntax depends on the structured output backend. For example, `xgrammar`, `guidance`, and `outlines` use Rust-style regex, while `lm-format-enforcer` uses Python's `re` module. The idea is to generate an email address, given a simple regex template: ??? code diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md index d5082bc7dd3a9..cff7ce1a882a1 100644 --- a/docs/getting_started/installation/README.md +++ b/docs/getting_started/installation/README.md @@ -26,3 +26,4 @@ The backends below live **outside** the main `vllm` repository and follow the | Rebellions ATOM / REBEL NPU | `vllm-rbln` | | | IBM Spyre AIU | `vllm-spyre` | | | Cambricon MLU | `vllm-mlu` | | +| Baidu Kunlun XPU | N/A, install from source | | diff --git a/docs/getting_started/installation/cpu.arm.inc.md b/docs/getting_started/installation/cpu.arm.inc.md index 156f31f633d57..8ec18bcb826ec 100644 --- a/docs/getting_started/installation/cpu.arm.inc.md +++ b/docs/getting_started/installation/cpu.arm.inc.md @@ -29,8 +29,27 @@ uv pip install --pre vllm==+cpu --extra-index-url https://wheels.vllm.a The `uv` approach works for vLLM `v0.6.6` and later. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. 
In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version. -!!! note - Nightly wheels are currently unsupported for this architecture. (e.g. to bisect the behavior change, performance regression). +**Install the latest code** + +LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides working pre-built Arm CPU wheels for every commit since `v0.11.2` on . For native CPU wheels, this index should be used: + +* `https://wheels.vllm.ai/nightly/cpu/vllm` + +To install from nightly index, copy the link address of the `*.whl` under this index to run, for example: + +```bash +uv pip install -U https://wheels.vllm.ai/c756fb678184b867ed94e5613a529198f1aee423/vllm-0.13.0rc2.dev11%2Bgc756fb678.cpu-cp38-abi3-manylinux_2_31_aarch64.whl # current nightly build (the filename will change!) +``` + +**Install specific revisions** + +If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), specify the full commit hash in the index: +https://wheels.vllm.ai/${VLLM_COMMIT}/cpu/vllm . +Then, copy the link address of the `*.whl` under this index to run: + +```bash +uv pip install -U +``` # --8<-- [end:pre-built-wheels] # --8<-- [start:build-wheel-from-source] diff --git a/docs/mkdocs/hooks/generate_metrics.py b/docs/mkdocs/hooks/generate_metrics.py new file mode 100644 index 0000000000000..b20d43c4b2e92 --- /dev/null +++ b/docs/mkdocs/hooks/generate_metrics.py @@ -0,0 +1,149 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import ast +import logging +from pathlib import Path +from typing import Literal + +logger = logging.getLogger("mkdocs") + +ROOT_DIR = Path(__file__).parent.parent.parent.parent +DOCS_DIR = ROOT_DIR / "docs" +GENERATED_METRICS_DIR = DOCS_DIR / "generated" / "metrics" + +# Files to scan for metric definitions - each will generate a separate table +METRIC_SOURCE_FILES = [ + {"path": "vllm/v1/metrics/loggers.py", "output": "general.md"}, + { + "path": "vllm/v1/spec_decode/metrics.py", + "output": "spec_decode.md", + }, + { + "path": "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py", + "output": "nixl_connector.md", + }, +] + + +class MetricExtractor(ast.NodeVisitor): + """AST visitor to extract metric definitions.""" + + def __init__(self): + self.metrics: list[dict[str, str]] = [] + + def visit_Call(self, node: ast.Call) -> None: + """Visit function calls to find metric class instantiations.""" + metric_type = self._get_metric_type(node) + if metric_type: + name = self._extract_kwarg(node, "name") + documentation = self._extract_kwarg(node, "documentation") + + if name: + self.metrics.append( + { + "name": name, + "type": metric_type, + "documentation": documentation or "", + } + ) + + self.generic_visit(node) + + def _get_metric_type(self, node: ast.Call) -> str | None: + """Determine if this call creates a metric and return its type.""" + metric_type_map = { + "_gauge_cls": "gauge", + "_counter_cls": "counter", + "_histogram_cls": "histogram", + } + if isinstance(node.func, ast.Attribute): + return metric_type_map.get(node.func.attr) + return None + + def _extract_kwarg(self, node: ast.Call, key: str) -> str | None: + """Extract a 
keyword argument value from a function call.""" + for keyword in node.keywords: + if keyword.arg == key: + return self._get_string_value(keyword.value) + return None + + def _get_string_value(self, node: ast.AST) -> str | None: + """Extract string value from an AST node.""" + if isinstance(node, ast.Constant): + return str(node.value) if node.value is not None else None + return None + + +def extract_metrics_from_file(filepath: Path) -> list[dict[str, str]]: + """Parse a Python file and extract all metric definitions.""" + try: + with open(filepath, encoding="utf-8") as f: + source = f.read() + + tree = ast.parse(source, filename=str(filepath)) + extractor = MetricExtractor() + extractor.visit(tree) + return extractor.metrics + except Exception as e: + raise RuntimeError(f"Failed to parse {filepath}: {e}") from e + + +def generate_markdown_table(metrics: list[dict[str, str]]) -> str: + """Generate a markdown table from extracted metrics.""" + if not metrics: + return "No metrics found.\n" + + # Sort by type, then by name + metrics_sorted = sorted(metrics, key=lambda m: (m["type"], m["name"])) + + lines = [] + lines.append("| Metric Name | Type | Description |") + lines.append("|-------------|------|-------------|") + + for metric in metrics_sorted: + name = metric["name"] + metric_type = metric["type"].capitalize() + doc = metric["documentation"].replace("\n", " ").strip() + lines.append(f"| `{name}` | {metric_type} | {doc} |") + + return "\n".join(lines) + "\n" + + +def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): + """Generate metrics documentation tables from source files.""" + logger.info("Generating metrics documentation") + + # Create generated directory if it doesn't exist + GENERATED_METRICS_DIR.mkdir(parents=True, exist_ok=True) + + total_metrics = 0 + for source_config in METRIC_SOURCE_FILES: + source_path = source_config["path"] + output_file = source_config["output"] + + filepath = ROOT_DIR / source_path + if not filepath.exists(): + raise FileNotFoundError(f"Metrics source file not found: {filepath}") + + logger.debug("Extracting metrics from: %s", source_path) + metrics = extract_metrics_from_file(filepath) + logger.debug("Found %d metrics in %s", len(metrics), source_path) + + # Generate and write the markdown table for this source + table_content = generate_markdown_table(metrics) + output_path = GENERATED_METRICS_DIR / output_file + with open(output_path, "w", encoding="utf-8") as f: + f.write(table_content) + + total_metrics += len(metrics) + logger.info( + "Generated metrics table: %s (%d metrics)", + output_path.relative_to(ROOT_DIR), + len(metrics), + ) + + logger.info( + "Total metrics generated: %d across %d files", + total_metrics, + len(METRIC_SOURCE_FILES), + ) diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 32ffcf96fabef..b4b0150faf841 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -316,10 +316,13 @@ We have split the `encode` task into two more specific token-wise tasks: `token_ ### Remove softmax from PoolingParams -We are going to remove `softmax` and `activation` from `PoolingParams`. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function. +We are going to remove `softmax` and `activation` from `PoolingParams` in v0.15. Instead, use `use_activation`, since we allow `classify` and `token_classify` to use any activation function. ### as_reward_model +!!! 
warning + We are going to remove `--convert reward` in v0.15, use `--convert embed` instead. + Pooling models now default support all pooling, you can use it without any settings. - Extracting hidden states prefers using `token_embed` task. diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index d0166060c267a..586d5d91634dc 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -568,7 +568,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A ``` !!! note - Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker.py](../../examples/pooling/score/qwen3_reranker.py). + Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/offline_reranker.py](../../examples/pooling/score/offline_reranker.py). ```bash vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' diff --git a/docs/serving/data_parallel_deployment.md b/docs/serving/data_parallel_deployment.md index eff9c5d5e4efa..e5954917cd790 100644 --- a/docs/serving/data_parallel_deployment.md +++ b/docs/serving/data_parallel_deployment.md @@ -24,7 +24,7 @@ There are two distinct modes supported for online deployments - self-contained w vLLM supports "self-contained" data parallel deployments that expose a single API endpoint. -It can be configured by simply including e.g. `--data-parallel-size=4` in the vllm serve command line arguments. This will require 4 GPUs. It can be combined with tensor parallel, for example `--data-parallel-size=4 --tensor-parallel-size=2`, which would require 8 GPUs. +It can be configured by simply including e.g. `--data-parallel-size=4` in the vllm serve command line arguments. This will require 4 GPUs. It can be combined with tensor parallel, for example `--data-parallel-size=4 --tensor-parallel-size=2`, which would require 8 GPUs. When sizing DP deployments, remember that `--max-num-seqs` applies per DP rank. Running a single data parallel deployment across multiple nodes requires a different `vllm serve` to be run on each node, specifying which DP ranks should run on that node. In this case, there will still be a single HTTP entrypoint - the API server(s) will run only on one node, but it doesn't necessarily need to be co-located with the DP ranks. @@ -80,6 +80,18 @@ When deploying large DP sizes using this method, the API server process can beco ![DP Internal LB Diagram](../assets/deployment/dp_internal_lb.png) +## Hybrid Load Balancing + +Hybrid load balancing sits between the internal and external approaches. Each node runs its own API server(s) that only queue requests to the data-parallel engines colocated on that node. An upstream load balancer (for example, an ingress controller or traffic router) spreads user requests across those per-node endpoints. + +Enable this mode with `--data-parallel-hybrid-lb` while still launching every node with the global data-parallel size. The key differences from internal load balancing are: + +- You must provide `--data-parallel-size-local` and `--data-parallel-start-rank` so each node knows which ranks it owns. +- Not compatible with `--headless` since every node exposes an API endpoint. 
+- Scale `--api-server-count` per node based on the number of local ranks. + +In this configuration, each node keeps scheduling decisions local, which reduces cross-node traffic and avoids single node bottlenecks at larger DP sizes. + ## External Load Balancing For larger scale deployments especially, it can make sense to handle the orchestration and load balancing of data parallel ranks externally. diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md index ec07896592ba3..923020dc88c91 100644 --- a/docs/serving/expert_parallel_deployment.md +++ b/docs/serving/expert_parallel_deployment.md @@ -40,10 +40,12 @@ EP_SIZE = TP_SIZE × DP_SIZE Where: -- `TP_SIZE`: Tensor parallel size (always 1 for now) +- `TP_SIZE`: Tensor parallel size - `DP_SIZE`: Data parallel size - `EP_SIZE`: Expert parallel size (computed automatically) +When EP is enabled, MoE layers use expert parallelism instead of tensor parallelism, while attention layers continue to use tensor parallelism if `TP_SIZE > 1`. + ### Example Command The following command serves a `DeepSeek-V3-0324` model with 1-way tensor parallel, 8-way (attention) data parallel, and 8-way expert parallel. The attention weights are replicated across all GPUs, while the expert weights are split across GPUs. It will work on a H200 (or H20) node with 8 GPUs. For H100, you can try to serve a smaller model or refer to the multi-node deployment section. @@ -81,7 +83,7 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \ --data-parallel-size-local 8 \ # Local DP size on this node (8 GPUs per node) --data-parallel-address 192.168.1.100 \ # Replace with actual IP of Node 1 --data-parallel-rpc-port 13345 \ # RPC communication port, can be any port as long as reachable by all nodes - --api-server-count=8 # Number of API servers for load handling (scaling this out to total ranks are recommended) + --api-server-count=8 # Number of API servers for load handling (scaling this out to the number of local ranks is recommended) # Node 2 (Secondary - headless mode, no API server) vllm serve deepseek-ai/DeepSeek-V3-0324 \ @@ -119,9 +121,6 @@ While MoE models are typically trained so that each expert receives a similar nu Enable EPLB with the `--enable-eplb` flag. -!!! note "Model Support" - Currently only DeepSeek V3 architecture is supported. - When enabled, vLLM collects load statistics with every forward pass and periodically rebalances expert distribution. ### EPLB Parameters @@ -134,6 +133,8 @@ Configure EPLB with the `--eplb-config` argument, which accepts a JSON string. T | `step_interval`| Frequency of rebalancing (every N engine steps) | 3000 | | `log_balancedness` | Log balancedness metrics (avg tokens per expert ÷ max tokens per expert) | `false` | | `num_redundant_experts` | Additional global experts per EP rank beyond equal distribution | `0` | +| `use_async` | Use non-blocking EPLB for reduced latency overhead | `false` | +| `policy` | The policy type for expert parallel load balancing | `"default"` | For example: @@ -183,6 +184,26 @@ vllm serve deepseek-ai/DeepSeek-V3-0324 \ For multi-node deployment, add these EPLB flags to each node's command. We recommend setting `num_redundant_experts` to 32 via `--eplb-config '{"num_redundant_experts":32}'` in large-scale use cases so the most popular experts are always available.
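To make the two new EPLB keys concrete, here is one shape the expanded config could take on a single node; the numeric values are illustrative, the key names come from the parameter table in this hunk, and the `--enable-expert-parallel` flag is assumed from vLLM's existing EP command line rather than from this diff:

```bash
# Serve with expert parallelism and asynchronous expert rebalancing enabled
vllm serve deepseek-ai/DeepSeek-V3-0324 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"step_interval": 3000, "num_redundant_experts": 32, "use_async": true, "policy": "default", "log_balancedness": true}'
```

With `use_async` enabled, the rebalancing work is taken off the critical path, which is what the reduced latency overhead in the table refers to.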
+## Advanced Configuration + +### Performance Optimization + +- **DeepEP kernels**: The `high_throughput` and `low_latency` kernels are optimized for disaggregated serving and may show poor performance for mixed workloads +- **Dual Batch Overlap**: Use `--enable-dbo` to overlap all-to-all communication with compute. See [Dual Batch Overlap](../design/dbo.md) for more details. +- **Async scheduling (experimental)**: Try `--async-scheduling` to overlap scheduling with model execution. + +### Troubleshooting + +- **`non-zero status: 7 cannot register cq buf`**: When using Infiniband/RoCE, make sure host VM and pods show `ulimit -l` "unlimited". +- **`init failed for transport: IBGDA`**: The InfiniBand GDA kernel modules are missing. Run `tools/ep_kernels/configure_system_drivers.sh` on each GPU node and reboot. Also fixes error `NVSHMEM API called before NVSHMEM initialization has completed`. +- **NVSHMEM peer disconnect**: Usually a networking misconfiguration. If deploying via Kubernetes, verify that every pod runs with `hostNetwork: true`, `securityContext.privileged: true` to access Infiniband. + +### Benchmarking + +- Use simulator flags `VLLM_MOE_ROUTING_SIMULATION_STRATEGY=uniform_random` and `VLLM_RANDOMIZE_DP_DUMMY_INPUTS=1` so token routing is balanced across EP ranks. + +- Increasing `VLLM_MOE_DP_CHUNK_SIZE` may increase throughput by increasing the maximum batch size for inter-rank token transfers. This may cause DeepEP to throw `assert self.nvshmem_qp_depth >= (num_max_dispatch_tokens_per_rank + 1) * 2`, which can be fixed by increasing environment variable `NVSHMEM_QP_DEPTH`. + ## Disaggregated Serving (Prefill/Decode Split) For production deployments requiring strict SLA guarantees for time-to-first-token and inter-token latency, disaggregated serving allows independent scaling of prefill and decode operations. @@ -273,3 +294,9 @@ except Exception as e: print(f"❌ Error during disaggregated serving: {e}") print("Check that both prefill and decode instances are running and accessible") ``` + +### Benchmarking + +- To simulate the decode deployment of disaggregated serving, pass `--kv-transfer-config '{"kv_connector":"DecodeBenchConnector","kv_role":"kv_both"}'` to the `vllm serve` invocation. The connector populates KV cache with random values so decode can be profiled in isolation. + +- **CUDAGraph capture**: Use `--compilation_config '{"cudagraph_mode": "FULL_DECODE_ONLY"}'` to enable CUDA graph capture for decode only and save KV cache. diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index 01453483a8d60..0e29204f8947c 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -851,7 +851,7 @@ endpoints are compatible with both [Jina AI's re-rank API interface](https://jin [Cohere's re-rank API interface](https://docs.cohere.com/v2/reference/rerank) to ensure compatibility with popular open-source tools. -Code example: [examples/pooling/score/jinaai_rerank_client.py](../../examples/pooling/score/jinaai_rerank_client.py) +Code example: [examples/pooling/score/openai_reranker.py](../../examples/pooling/score/openai_reranker.py) #### Example Request diff --git a/docs/usage/metrics.md b/docs/usage/metrics.md index d756e32476f0a..829533b84328f 100644 --- a/docs/usage/metrics.md +++ b/docs/usage/metrics.md @@ -33,11 +33,19 @@ Then query the endpoint to get the latest metrics from the server: The following metrics are exposed: -??? 
code +## General Metrics - ```python - --8<-- "vllm/engine/metrics.py:metrics-definitions" - ``` +--8<-- "docs/generated/metrics/general.md" + +## Speculative Decoding Metrics + +--8<-- "docs/generated/metrics/spec_decode.md" + +## NIXL KV Connector Metrics + +--8<-- "docs/generated/metrics/nixl_connector.md" + +## Deprecation Policy Note: when metrics are deprecated in version `X.Y`, they are hidden in version `X.Y+1` but can be re-enabled using the `--show-hidden-metrics-for-version=X.Y` escape hatch, diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py index df6e96ca375fc..40462c78ae8c2 100755 --- a/examples/offline_inference/audio_language.py +++ b/examples/offline_inference/audio_language.py @@ -422,7 +422,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) parser.add_argument( diff --git a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py index 8f3d1a5c00369..2d575840e6a71 100644 --- a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py +++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py @@ -30,7 +30,7 @@ def main(): max_num_batched_tokens=64, max_num_seqs=16, kv_transfer_config=KVTransferConfig( - kv_connector="SharedStorageConnector", + kv_connector="ExampleConnector", kv_role="kv_both", kv_connector_extra_config={"shared_storage_path": "local_storage"}, ), diff --git a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py index 0bfe7ec0e6cf6..207c6daebc2f5 100644 --- a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py +++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py @@ -26,7 +26,7 @@ def main(): enforce_eager=True, gpu_memory_utilization=0.8, kv_transfer_config=KVTransferConfig( - kv_connector="SharedStorageConnector", + kv_connector="ExampleConnector", kv_role="kv_both", kv_connector_extra_config={"shared_storage_path": "local_storage"}, ), diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py index c1d6c6db53dfb..857767ac3c628 100644 --- a/examples/offline_inference/encoder_decoder_multimodal.py +++ b/examples/offline_inference/encoder_decoder_multimodal.py @@ -77,7 +77,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) return parser.parse_args() diff --git a/examples/offline_inference/kv_load_failure_recovery/README.md b/examples/offline_inference/kv_load_failure_recovery/README.md index 230a16812b25e..1f29a6ff56dbc 100644 --- a/examples/offline_inference/kv_load_failure_recovery/README.md +++ b/examples/offline_inference/kv_load_failure_recovery/README.md @@ -10,7 +10,7 @@ It demonstrates vLLM's ability to recover from KV load failures in both synchron - `decode_example.py` – performs the decode stage. Accepts: - `--simulate-failure`: simulates KV load failure using a custom connector. - `--async-load`: enables asynchronous KV loading mode. -- `rogue_shared_storage_connector.py` – defines `RogueSharedStorageConnector`, a subclass of `SharedStorageConnector`, that simulates missing or corrupted external KV blocks by failing to load blocks for the first decode request. 
+- `load_recovery_example_connector.py` – defines `LoadRecoveryExampleConnector`, a subclass of `ExampleConnector`, that simulates missing or corrupted external KV blocks by failing to load blocks for the first decode request. - `run.sh` – orchestrates the test: runs the prefill stage, then three decode stages: 1. Normal decode (baseline). 2. Decode with simulated sync KV load failure. @@ -20,7 +20,7 @@ It demonstrates vLLM's ability to recover from KV load failures in both synchron ## How It Works -- The test dynamically loads `RogueSharedStorageConnector` via `KVTransferConfig.kv_connector_module_path`, enabling controlled simulation of load failures without modifying the original connector. +- The test dynamically loads `LoadRecoveryExampleConnector` via `KVTransferConfig.kv_connector_module_path`, enabling controlled simulation of load failures without modifying the original connector. - The decode stages that simulate failure are expected to trigger recovery logic in vLLM, resulting in the same output as the baseline decode. - If recovery fails, the script prints a unified diff of the output mismatch and exits with error. diff --git a/examples/offline_inference/kv_load_failure_recovery/decode_example.py b/examples/offline_inference/kv_load_failure_recovery/decode_example.py index 69523f56eace3..d0df54167aeac 100644 --- a/examples/offline_inference/kv_load_failure_recovery/decode_example.py +++ b/examples/offline_inference/kv_load_failure_recovery/decode_example.py @@ -35,13 +35,13 @@ def main(): if args.simulate_failure: ktc = KVTransferConfig( - kv_connector="RogueSharedStorageConnector", + kv_connector="LoadRecoveryExampleConnector", kv_role="kv_both", kv_connector_extra_config={ "shared_storage_path": "local_storage", "async_load": args.async_load, }, - kv_connector_module_path="rogue_shared_storage_connector", + kv_connector_module_path="load_recovery_example_connector", ) out_file = ( "async_decode_recovered_output.txt" @@ -50,7 +50,7 @@ def main(): ) else: ktc = KVTransferConfig( - kv_connector="SharedStorageConnector", + kv_connector="ExampleConnector", kv_role="kv_both", kv_connector_extra_config={ "shared_storage_path": "local_storage", diff --git a/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py b/examples/offline_inference/kv_load_failure_recovery/load_recovery_example_connector.py similarity index 88% rename from examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py rename to examples/offline_inference/kv_load_failure_recovery/load_recovery_example_connector.py index 5b2acea4c9457..7aab07f8a2c33 100644 --- a/examples/offline_inference/kv_load_failure_recovery/rogue_shared_storage_connector.py +++ b/examples/offline_inference/kv_load_failure_recovery/load_recovery_example_connector.py @@ -10,9 +10,9 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorMetadata, KVConnectorRole, ) -from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( - SharedStorageConnector, - SharedStorageConnectorMetadata, +from vllm.distributed.kv_transfer.kv_connector.v1.example_connector import ( + ExampleConnector, + ExampleConnectorMetadata, ) from vllm.forward_context import ForwardContext from vllm.v1.core.kv_cache_manager import KVCacheBlocks @@ -26,15 +26,15 @@ logging.basicConfig(level=logging.INFO) @dataclass -class RogueSharedStorageConnectorMetadata(SharedStorageConnectorMetadata): +class LoadRecoveryExampleConnectorMetadata(ExampleConnectorMetadata): req_to_block_ids: 
dict[str, set[int]] = field(default_factory=dict) @classmethod - def from_base(cls, base: SharedStorageConnectorMetadata): + def from_base(cls, base: ExampleConnectorMetadata): return cls(requests=base.requests) -class RogueSharedStorageConnector(SharedStorageConnector): +class LoadRecoveryExampleConnector(ExampleConnector): def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole): super().__init__(vllm_config=vllm_config, role=role) self._async_load = vllm_config.kv_transfer_config.get_from_extra_config( @@ -45,7 +45,7 @@ class RogueSharedStorageConnector(SharedStorageConnector): self._req_to_block_ids: dict[str, list[int]] = dict() def bind_connector_metadata(self, connector_metadata: KVConnectorMetadata) -> None: - assert isinstance(connector_metadata, RogueSharedStorageConnectorMetadata) + assert isinstance(connector_metadata, LoadRecoveryExampleConnectorMetadata) index, failed_request = next( ( (i, x) @@ -84,7 +84,7 @@ class RogueSharedStorageConnector(SharedStorageConnector): ) -> tuple[set[str] | None, set[str] | None]: if self._async_load: meta = self._get_connector_metadata() - assert isinstance(meta, RogueSharedStorageConnectorMetadata) + assert isinstance(meta, LoadRecoveryExampleConnectorMetadata) if meta.req_to_block_ids: return None, set(meta.req_to_block_ids) @@ -126,9 +126,9 @@ class RogueSharedStorageConnector(SharedStorageConnector): ) -> KVConnectorMetadata: if not self._async_load: base = super().build_connector_meta(scheduler_output) - meta = RogueSharedStorageConnectorMetadata.from_base(base) + meta = LoadRecoveryExampleConnectorMetadata.from_base(base) else: - meta = RogueSharedStorageConnectorMetadata() + meta = LoadRecoveryExampleConnectorMetadata() if self._requests_need_load: for req_id, request in self._requests_need_load.items(): meta.add_request( diff --git a/examples/offline_inference/kv_load_failure_recovery/prefill_example.py b/examples/offline_inference/kv_load_failure_recovery/prefill_example.py index 047b81c82df53..ee4a84fd95003 100644 --- a/examples/offline_inference/kv_load_failure_recovery/prefill_example.py +++ b/examples/offline_inference/kv_load_failure_recovery/prefill_example.py @@ -26,7 +26,7 @@ def main(): enforce_eager=True, gpu_memory_utilization=0.8, kv_transfer_config=KVTransferConfig( - kv_connector="SharedStorageConnector", + kv_connector="ExampleConnector", kv_role="kv_both", kv_connector_extra_config={"shared_storage_path": "local_storage"}, ), diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py index ed005e6a69b80..cee83519fadcc 100644 --- a/examples/offline_inference/qwen2_5_omni/only_thinker.py +++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py @@ -158,7 +158,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) diff --git a/examples/offline_inference/qwen3_omni/only_thinker.py b/examples/offline_inference/qwen3_omni/only_thinker.py index 88a61ed694c2e..62131633da8aa 100644 --- a/examples/offline_inference/qwen3_omni/only_thinker.py +++ b/examples/offline_inference/qwen3_omni/only_thinker.py @@ -158,7 +158,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) diff --git a/examples/offline_inference/simple_profiling.py b/examples/offline_inference/simple_profiling.py index 46858fffadc52..e8a75cd03befb 100644 --- 
a/examples/offline_inference/simple_profiling.py +++ b/examples/offline_inference/simple_profiling.py @@ -1,14 +1,10 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import os import time from vllm import LLM, SamplingParams -# enable torch profiler, can also be set on cmd line -os.environ["VLLM_TORCH_PROFILER_DIR"] = "./vllm_profile" - # Sample prompts. prompts = [ "Hello, my name is", @@ -22,7 +18,14 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) def main(): # Create an LLM. - llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1) + llm = LLM( + model="facebook/opt-125m", + tensor_parallel_size=1, + profiler_config={ + "profiler": "torch", + "torch_profiler_dir": "./vllm_profile", + }, + ) llm.start_profile() diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py index 22802dddf7893..9142279140e56 100755 --- a/examples/offline_inference/vision_language.py +++ b/examples/offline_inference/vision_language.py @@ -2031,7 +2031,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py index 28c466c03dfa5..3c01806baa203 100755 --- a/examples/offline_inference/vision_language_multi_image.py +++ b/examples/offline_inference/vision_language_multi_image.py @@ -1382,7 +1382,7 @@ def run_generate( model, question: str, image_urls: list[str], - seed: int | None, + seed: int, tensor_parallel_size: int | None, ): req_data = model_example_map[model](question, image_urls) @@ -1416,7 +1416,7 @@ def run_chat( model: str, question: str, image_urls: list[str], - seed: int | None, + seed: int, tensor_parallel_size: int | None, ): req_data = model_example_map[model](question, image_urls) @@ -1494,7 +1494,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) parser.add_argument( diff --git a/examples/online_serving/disaggregated_encoder/README.md b/examples/online_serving/disaggregated_encoder/README.md index 5813a3cecf73b..b2c3bb974dfab 100644 --- a/examples/online_serving/disaggregated_encoder/README.md +++ b/examples/online_serving/disaggregated_encoder/README.md @@ -50,12 +50,12 @@ The vllm instances and `disagg_encoder_proxy` supports local URIs with ```{"url" ## EC connector and KV transfer -The `ECSharedStorageConnector` is used to store the encoder cache on local disk and facilitate transfer. To enable the encoder disaggregation feature, add the following configuration: +The `ECExampleConnector` is used to store the encoder cache on local disk and facilitate transfer.
To enable the encoder disaggregation feature, add the following configuration: ```bash # Add to encoder instance: --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_producer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" @@ -64,7 +64,7 @@ The `ECSharedStorageConnector` is used to store the encoder cache on local disk # Add to prefill/prefill+decode instance: --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_consumer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" diff --git a/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh b/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh index 57489df64f51e..95a418374ad28 100644 --- a/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh +++ b/examples/online_serving/disaggregated_encoder/disagg_1e1p1d_example.sh @@ -102,7 +102,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \ --max-num-seqs 128 \ --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_producer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" @@ -126,7 +126,7 @@ vllm serve "$MODEL" \ --max-num-seqs 128 \ --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_consumer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" diff --git a/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh b/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh index 6073e0580b11d..c4a591d7438cb 100644 --- a/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh +++ b/examples/online_serving/disaggregated_encoder/disagg_1e1pd_example.sh @@ -96,7 +96,7 @@ CUDA_VISIBLE_DEVICES="$GPU_E" vllm serve "$MODEL" \ --max-num-seqs 128 \ --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_producer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" @@ -117,7 +117,7 @@ CUDA_VISIBLE_DEVICES="$GPU_PD" vllm serve "$MODEL" \ --max-num-seqs 128 \ --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_consumer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" diff --git a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py index 5d8e38c73b89a..c8965e050ff0b 100644 --- a/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py +++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py @@ -26,9 +26,21 @@ async def lifespan(app: FastAPI): ) app.state.prefill_client = httpx.AsyncClient( - timeout=None, base_url=prefiller_base_url + timeout=None, + base_url=prefiller_base_url, + limits=httpx.Limits( + max_connections=None, + max_keepalive_connections=None, + 
), + ) + app.state.decode_client = httpx.AsyncClient( + timeout=None, + base_url=decoder_base_url, + limits=httpx.Limits( + max_connections=None, + max_keepalive_connections=None, + ), ) - app.state.decode_client = httpx.AsyncClient(timeout=None, base_url=decoder_base_url) yield @@ -105,6 +117,11 @@ async def send_request_to_service( headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} response = await client.post(endpoint, json=req_data, headers=headers) response.raise_for_status() + + # read/consume the response body to release the connection + # otherwise, it would http.ReadError + await response.aread() + return response diff --git a/examples/pooling/plugin/prithvi_geospatial_mae_client.py b/examples/pooling/plugin/prithvi_geospatial_mae_client.py index a6246999c14d6..1ba1fd6a92ca4 100644 --- a/examples/pooling/plugin/prithvi_geospatial_mae_client.py +++ b/examples/pooling/plugin/prithvi_geospatial_mae_client.py @@ -16,7 +16,7 @@ import requests # - start vllm in serving mode with the below args # --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM' # --model-impl terratorch -# --task embed --trust-remote-code +# --trust-remote-code # --skip-tokenizer-init --enforce-eager # --io-processor-plugin terratorch_segmentation # --enable-mm-embeds diff --git a/examples/pooling/pooling/vision_language_pooling.py b/examples/pooling/pooling/vision_language_pooling.py index 530aad4bc031c..dda56bc34df2e 100644 --- a/examples/pooling/pooling/vision_language_pooling.py +++ b/examples/pooling/pooling/vision_language_pooling.py @@ -305,7 +305,7 @@ def get_query(modality: QueryModality): raise ValueError(msg) -def run_encode(model: str, modality: QueryModality, seed: int | None): +def run_encode(model: str, modality: QueryModality, seed: int): query = get_query(modality) req_data = model_example_map[model](query) @@ -335,7 +335,7 @@ def run_encode(model: str, modality: QueryModality, seed: int | None): print("-" * 50) -def run_score(model: str, modality: QueryModality, seed: int | None): +def run_score(model: str, modality: QueryModality, seed: int): query = get_query(modality) req_data = model_example_map[model](query) @@ -390,7 +390,7 @@ def parse_args(): parser.add_argument( "--seed", type=int, - default=None, + default=0, help="Set the seed when initializing `vllm.LLM`.", ) return parser.parse_args() diff --git a/examples/pooling/score/qwen3_reranker.py b/examples/pooling/score/offline_reranker.py similarity index 100% rename from examples/pooling/score/qwen3_reranker.py rename to examples/pooling/score/offline_reranker.py diff --git a/examples/pooling/score/jinaai_rerank_client.py b/examples/pooling/score/openai_reranker.py similarity index 100% rename from examples/pooling/score/jinaai_rerank_client.py rename to examples/pooling/score/openai_reranker.py diff --git a/mkdocs.yaml b/mkdocs.yaml index bf97093dafb11..8fb8f0568c6ef 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -51,6 +51,7 @@ hooks: - docs/mkdocs/hooks/remove_announcement.py - docs/mkdocs/hooks/generate_examples.py - docs/mkdocs/hooks/generate_argparse.py + - docs/mkdocs/hooks/generate_metrics.py - docs/mkdocs/hooks/url_schemes.py plugins: diff --git a/requirements/kv_connectors.txt b/requirements/kv_connectors.txt index 083230c171096..f60a01a55d07c 100644 --- a/requirements/kv_connectors.txt +++ b/requirements/kv_connectors.txt @@ -1,2 +1,2 @@ -lmcache +lmcache >= 0.3.10.post1 nixl >= 0.7.1 # Required for disaggregated prefill diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index 
f25835c68ddcf..3f0fd235fba50 100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -75,7 +75,7 @@ torchgeo==0.7.0 mteb==2.1.2 # Data processing -xgrammar==0.1.27 +xgrammar @ git+https://github.com/divakar-amd/xgrammar@3272f7c520564858056a60480d5afdf69ae79c84 # Test async scheduling # Utilities diff --git a/tests/compile/test_compile_ranges.py b/tests/compile/test_compile_ranges.py index d849a8617ebd2..14ae8233f1131 100644 --- a/tests/compile/test_compile_ranges.py +++ b/tests/compile/test_compile_ranges.py @@ -80,6 +80,8 @@ def test_compile_ranges(use_fresh_inductor_cache): vllm_config = VllmConfig( scheduler_config=SchedulerConfig( max_num_batched_tokens=8192, + max_model_len=8192, + is_encoder_decoder=False, ), compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, @@ -112,6 +114,8 @@ def test_compile_config_get_compile_ranges(): VllmConfig( scheduler_config=SchedulerConfig( max_num_batched_tokens=8192, + max_model_len=8192, + is_encoder_decoder=False, ), compilation_config=compilation_config, ) @@ -134,6 +138,8 @@ def test_inductor_cache_compile_ranges(monkeypatch, use_fresh_inductor_cache): ) scheduler_config = SchedulerConfig( max_num_batched_tokens=8192, + max_model_len=8192, + is_encoder_decoder=False, ) torch.set_default_device("cuda") diff --git a/tests/compile/test_config.py b/tests/compile/test_config.py index 0e91cf525411e..04bb56ecb6470 100644 --- a/tests/compile/test_config.py +++ b/tests/compile/test_config.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import copy -import logging from contextlib import nullcontext from unittest.mock import patch @@ -13,7 +12,6 @@ from vllm.compilation.fix_functionalization import FixFunctionalizationPass from vllm.config import CompilationConfig, CUDAGraphMode, ParallelConfig, VllmConfig from vllm.config.compilation import CompilationMode, PassConfig from vllm.engine.arg_utils import EngineArgs -from vllm.logger import _print_warning_once from vllm.platforms import current_platform from vllm.utils.torch_utils import _is_torch_equal_or_newer @@ -290,7 +288,7 @@ def test_moe_splitting_ops_deepep_ht_attn_fusion_no_inductor(): ), compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, - pass_config={"enable_attn_fusion": True, "enable_noop": True}, + pass_config={"fuse_attn_quant": True, "eliminate_noops": True}, custom_ops=["+quant_fp8"], cudagraph_mode=CUDAGraphMode.PIECEWISE, ), @@ -442,62 +440,3 @@ def test_cudagraph_sizes_post_init( vllm_config.compilation_config.max_cudagraph_capture_size == expected_max_size ) - - -def test_pass_config_deprecation(caplog_vllm): - caplog_vllm.set_level(logging.WARNING) - - # Clear cache to ensure warnings are re-issued - _print_warning_once.cache_clear() - - # Test enable_fusion -> fuse_norm_quant, fuse_act_quant - caplog_vllm.clear() - config = PassConfig(enable_fusion=True) - assert "enable_fusion is deprecated" in caplog_vllm.text - assert config.fuse_norm_quant is True - assert config.fuse_act_quant is True - assert config.enable_fusion is True - - # Test enable_attn_fusion -> fuse_attn_quant - caplog_vllm.clear() - config = PassConfig(enable_attn_fusion=True) - assert "enable_attn_fusion is deprecated" in caplog_vllm.text - assert config.fuse_attn_quant is True - assert config.enable_attn_fusion is True - - # Test enable_noop -> eliminate_noops - caplog_vllm.clear() - config = PassConfig(enable_noop=True) - assert "enable_noop is deprecated" in caplog_vllm.text - 
assert config.eliminate_noops is True - assert config.enable_noop is True - - # Test enable_sequence_parallelism -> enable_sp - caplog_vllm.clear() - config = PassConfig(enable_sequence_parallelism=True) - assert "enable_sequence_parallelism is deprecated" in caplog_vllm.text - assert config.enable_sp is True - assert config.enable_sequence_parallelism is True - - # Test enable_async_tp -> fuse_gemm_comms - caplog_vllm.clear() - config = PassConfig(enable_async_tp=True) - assert "enable_async_tp is deprecated" in caplog_vllm.text - assert config.fuse_gemm_comms is True - assert config.enable_async_tp is True - - # Test enable_fi_allreduce_fusion -> fuse_allreduce_rms - caplog_vllm.clear() - config = PassConfig(enable_fi_allreduce_fusion=True) - assert "enable_fi_allreduce_fusion is deprecated" in caplog_vllm.text - assert config.fuse_allreduce_rms is True - assert config.enable_fi_allreduce_fusion is True - - # Test hash consistency - config_old = PassConfig(enable_fusion=True) - config_new = PassConfig(fuse_norm_quant=True, fuse_act_quant=True) - assert config_old.compute_hash() == config_new.compute_hash() - - config_old = PassConfig(enable_async_tp=True) - config_new = PassConfig(fuse_gemm_comms=True) - assert config_old.compute_hash() == config_new.compute_hash() diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index 2ad34a79859a3..6b72c595cd779 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -1,10 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import itertools + import pytest import torch import vllm.plugins +from vllm._aiter_ops import IS_AITER_FOUND, rocm_aiter_ops from vllm.compilation.fusion import FUSED_OPS, FusedRMSQuantKey, RMSNormQuantFusionPass from vllm.compilation.fx_utils import find_op_nodes from vllm.compilation.matcher_utils import QUANT_OPS @@ -152,13 +155,79 @@ GROUP_SHAPES = [ ] +class TestRmsnormGroupFp8QuantModel(torch.nn.Module): + def __init__(self, hidden_size: int, eps: float, **kwargs): + super().__init__() + self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp( + weight_group_shape=GroupShape(128, 128), + act_quant_group_shape=GroupShape(1, 128), + cutlass_block_fp8_supported=False, + use_aiter_and_is_supported=True, + ) + self.w = [ + torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() + for _ in range(3) + ] + + scale_hidden_size = (hidden_size + 128 - 1) // 128 + self.wscale = [ + torch.rand((scale_hidden_size, scale_hidden_size), dtype=torch.float32) + for _ in range(3) + ] + + self.norm_weight = [torch.ones(hidden_size) for _ in range(4)] + self.eps = eps + + def forward(self, x): + # avoid having graph input be an arg to a pattern directly + x = resid = torch.relu(x) + y = rocm_aiter_ops.rms_norm(x, self.norm_weight[0], self.eps) + + x2 = self.w8a8_block_fp8_linear.apply(y, self.w[0], self.wscale[0]) + # make sure resid is used for replacement to work + y2, resid = rocm_aiter_ops.rms_norm2d_with_add( + x2, resid, self.norm_weight[1], self.eps + ) + + x3 = self.w8a8_block_fp8_linear.apply(y2, self.w[1], self.wscale[1]) + + y3, resid = rocm_aiter_ops.rms_norm2d_with_add( + x3, resid, self.norm_weight[2], self.eps + ) + + x4 = self.w8a8_block_fp8_linear.apply(y3, self.w[2], self.wscale[2]) + + y4, resid = rocm_aiter_ops.rms_norm2d_with_add( + x4, resid, self.norm_weight[3], self.eps + ) + return y4 + + def ops_in_model_before(self): + return [ + torch.ops.vllm.rocm_aiter_rms_norm, + torch.ops.vllm.rocm_aiter_group_fp8_quant, + 
] + + def ops_in_model_before_partial(self): + return [] + + def ops_in_model_after(self): + return [ + torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant, + torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant, + ] + + @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("hidden_size", [256]) @pytest.mark.parametrize("num_tokens", [257]) @pytest.mark.parametrize("eps", [1e-5, 1e-6]) @pytest.mark.parametrize("group_shape", GROUP_SHAPES) -@pytest.mark.parametrize("enable_rms_norm_custom_op", [True, False]) -@pytest.mark.parametrize("enable_quant_fp8_custom_op", [True, False]) +@pytest.mark.parametrize( + "model_class, enable_rms_norm_custom_op, enable_quant_fp8_custom_op", + list(itertools.product([TestModel], [True, False], [True, False])) + + [(TestRmsnormGroupFp8QuantModel, False, False)], +) # cuda_force_torch used to test torch code path on platforms that # cutlass_fp8_supported() == True. @pytest.mark.parametrize( @@ -173,10 +242,14 @@ def test_fusion_rmsnorm_quant( num_tokens, eps, group_shape, + model_class, enable_rms_norm_custom_op, enable_quant_fp8_custom_op, cuda_force_torch, ): + if model_class is TestRmsnormGroupFp8QuantModel and not IS_AITER_FOUND: + pytest.skip("AITER is not supported on this GPU.") + torch.set_default_device("cuda") torch.set_default_dtype(dtype) torch.manual_seed(1) @@ -209,12 +282,24 @@ def test_fusion_rmsnorm_quant( with vllm.config.set_current_vllm_config(vllm_config): # Reshape pass is needed for the fusion pass to work noop_pass = NoOpEliminationPass(vllm_config) - fusion_pass = RMSNormQuantFusionPass(vllm_config) + if model_class is TestRmsnormGroupFp8QuantModel: + from vllm.compilation.rocm_aiter_fusion import ( + RocmAiterRMSNormFp8GroupQuantFusionPass, + ) + + fusion_pass = RocmAiterRMSNormFp8GroupQuantFusionPass(vllm_config) + else: + fusion_pass = RMSNormQuantFusionPass(vllm_config) cleanup_pass = PostCleanupPass(vllm_config) backend = TestBackend(noop_pass, fusion_pass, cleanup_pass) backend2 = TestBackend(noop_pass, cleanup_pass) - model = TestModel(hidden_size, eps, group_shape, cuda_force_torch) + model = model_class( + hidden_size=hidden_size, + eps=eps, + group_shape=group_shape, + cuda_force_torch=cuda_force_torch, + ) # First dimension dynamic x = torch.rand(num_tokens, hidden_size) torch._dynamo.mark_dynamic(x, 0) @@ -243,7 +328,10 @@ def test_fusion_rmsnorm_quant( # there's a risk that the fused add doesn't get included in the # replacement and only the rms part gets fused with quant. # Hence, we check only 2 add nodes are left (final fused rmsnorm add). 
- if not enable_rms_norm_custom_op: + if ( + not enable_rms_norm_custom_op + and model_class is not TestRmsnormGroupFp8QuantModel + ): n_add_nodes = lambda g: sum(1 for _ in find_op_nodes(torch.ops.aten.add, g)) # 7 = 1 (RMS) + 3x2 (3xRMS_ADD, 2 each) assert n_add_nodes(backend.graph_pre_pass) == 7 diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py index 6d0ba6b655031..6ed77b0085f51 100644 --- a/tests/compile/test_pass_manager.py +++ b/tests/compile/test_pass_manager.py @@ -5,9 +5,14 @@ import copy import pytest import torch -from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass +from vllm.compilation.inductor_pass import ( + CallableInductorPass, + InductorPass, + pass_context, +) from vllm.compilation.pass_manager import PostGradPassManager from vllm.config import ModelConfig, VllmConfig +from vllm.config.utils import Range # dummy custom pass that doesn't inherit @@ -42,35 +47,37 @@ class ProperPass(InductorPass): ], ) def test_pass_manager_uuid(callable): - # Some passes need dtype to be set - config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16)) + # Set the pass context as PassManager uuid uses it + with pass_context(Range(start=1, end=8)): + # Some passes need dtype to be set + config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16)) - pass_manager = PostGradPassManager() - pass_manager.configure(config) + pass_manager = PostGradPassManager() + pass_manager.configure(config) - # Check that UUID is different if the same pass is added 2x - pass_manager.add(callable) - uuid1 = pass_manager.uuid() - pass_manager.add(callable) - uuid2 = pass_manager.uuid() - assert uuid1 != uuid2 + # Check that UUID is different if the same pass is added 2x + pass_manager.add(callable) + uuid1 = pass_manager.uuid() + pass_manager.add(callable) + uuid2 = pass_manager.uuid() + assert uuid1 != uuid2 - # UUID should be the same as the original one, - # as we constructed in the same way. - pass_manager2 = PostGradPassManager() - pass_manager2.configure(config) - pass_manager2.add(callable) - assert uuid1 == pass_manager2.uuid() + # UUID should be the same as the original one, + # as we constructed in the same way. 
+ pass_manager2 = PostGradPassManager() + pass_manager2.configure(config) + pass_manager2.add(callable) + assert uuid1 == pass_manager2.uuid() - # UUID should be different due to config change - config2 = copy.deepcopy(config) - config2.compilation_config.pass_config.fuse_norm_quant = ( - not config2.compilation_config.pass_config.fuse_norm_quant - ) - config2.compilation_config.pass_config.fuse_act_quant = ( - not config2.compilation_config.pass_config.fuse_act_quant - ) - pass_manager3 = PostGradPassManager() - pass_manager3.configure(config2) - pass_manager3.add(callable) - assert uuid1 != pass_manager3.uuid() + # UUID should be different due to config change + config2 = copy.deepcopy(config) + config2.compilation_config.pass_config.fuse_norm_quant = ( + not config2.compilation_config.pass_config.fuse_norm_quant + ) + config2.compilation_config.pass_config.fuse_act_quant = ( + not config2.compilation_config.pass_config.fuse_act_quant + ) + pass_manager3 = PostGradPassManager() + pass_manager3.configure(config2) + pass_manager3.add(callable) + assert uuid1 != pass_manager3.uuid() diff --git a/tests/compile/test_silu_mul_quant_fusion.py b/tests/compile/test_silu_mul_quant_fusion.py index c336a45955cb5..eb0dee8d4e399 100644 --- a/tests/compile/test_silu_mul_quant_fusion.py +++ b/tests/compile/test_silu_mul_quant_fusion.py @@ -7,6 +7,7 @@ import torch import vllm.envs as envs from tests.kernels.quantization.nvfp4_utils import quant_nvfp4_tensor +from vllm._aiter_ops import IS_AITER_FOUND from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant from vllm.compilation.activation_quant_fusion import ( FUSED_OPS, @@ -24,6 +25,7 @@ from vllm.config import ( set_current_vllm_config, ) from vllm.model_executor.layers.activation import SiluAndMul +from vllm.model_executor.layers.quantization.utils.fp8_utils import W8A8BlockFp8LinearOp from vllm.model_executor.layers.quantization.utils.quant_utils import ( GroupShape, kFp8StaticTensorSym, @@ -126,6 +128,39 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module): return [FUSED_OPS[kNvfp4Quant]] +class TestSiluMulGroupFp8QuantModel(torch.nn.Module): + def __init__(self, hidden_size: int, **kwargs): + super().__init__() + self.silu_and_mul = SiluAndMul() + self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp( + weight_group_shape=GroupShape(128, 128), + act_quant_group_shape=GroupShape(1, 128), + cutlass_block_fp8_supported=False, + use_aiter_and_is_supported=True, + ) + self.w = torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() + + scale_hidden_size = (hidden_size + 128 - 1) // 128 + self.wscale = torch.rand( + (scale_hidden_size, scale_hidden_size), dtype=torch.float32 + ) + + self.enable_silu_mul_custom_op = self.silu_and_mul.enabled() + + def forward(self, x): + y = self.silu_and_mul(x) + x2 = self.w8a8_block_fp8_linear.apply(y, self.w, self.wscale) + return x2 + + def ops_in_model_before(self): + return [ + SILU_MUL_OP if self.enable_silu_mul_custom_op else torch.ops.aten.mul, + ] + + def ops_in_model_after(self): + return [torch.ops.vllm.rocm_aiter_act_mul_and_fp8_group_quant] + + @pytest.mark.parametrize("num_tokens", [32, 64]) @pytest.mark.parametrize("hidden_size", [128, 256]) @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16]) @@ -133,7 +168,10 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module): @pytest.mark.parametrize( "model_class, enable_quant_fp8_custom_op, cuda_force_torch", list(itertools.product([TestSiluMulFp8QuantModel], [True, False], [True, False])) - + [(TestSiluMulNvfp4QuantModel, False, False)], 
+ + [ + (TestSiluMulNvfp4QuantModel, False, False), + (TestSiluMulGroupFp8QuantModel, False, False), + ], ) # cuda_force_torch used to test torch code path on platforms that # cutlass_fp8_supported() == True. @@ -144,13 +182,19 @@ def test_fusion_silu_and_mul_quant( num_tokens: int, hidden_size: int, dtype: torch.dtype, - model_class: type[TestSiluMulFp8QuantModel | TestSiluMulNvfp4QuantModel], + model_class: type[ + TestSiluMulFp8QuantModel + | TestSiluMulNvfp4QuantModel + | TestSiluMulGroupFp8QuantModel + ], enable_silu_mul_custom_op: bool, enable_quant_fp8_custom_op: bool, cuda_force_torch: bool, ): if model_class is TestSiluMulNvfp4QuantModel and not is_nvfp4_supported(): pytest.skip("NVFP4 is not supported on this GPU.") + if model_class is TestSiluMulGroupFp8QuantModel and not IS_AITER_FOUND: + pytest.skip("AITER is not supported on this GPU.") torch.set_default_device("cuda") torch.set_default_dtype(dtype) @@ -173,9 +217,15 @@ def test_fusion_silu_and_mul_quant( ) with set_current_vllm_config(config): - fusion_pass = ActivationQuantFusionPass(config) + fusion_passes = [ActivationQuantFusionPass(config)] + if IS_AITER_FOUND: + from vllm.compilation.rocm_aiter_fusion import ( + RocmAiterSiluMulFp8GroupQuantFusionPass, + ) - passes = [NoOpEliminationPass(config), fusion_pass, PostCleanupPass(config)] + fusion_passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)] + + passes = [NoOpEliminationPass(config), *fusion_passes, PostCleanupPass(config)] backend = TestBackend(*passes) model = model_class( hidden_size=hidden_size, cuda_force_torch=cuda_force_torch, x=x @@ -194,12 +244,14 @@ def test_fusion_silu_and_mul_quant( atol, rtol = 1e-3, 1e-3 elif model_class == TestSiluMulNvfp4QuantModel: atol, rtol = 1e-1, 1e-1 + elif model_class == TestSiluMulGroupFp8QuantModel: + atol, rtol = 5e-2, 5e-2 torch.testing.assert_close( result[0].to(dtype=dtype), result2[0].to(dtype=dtype), atol=atol, rtol=rtol ) - assert fusion_pass.matched_count == 1 + assert sum([p.matched_count for p in fusion_passes]) == 1 # In pre-nodes, quant op should be present and fused kernels should not backend.check_before_ops(model.ops_in_model_before()) diff --git a/tests/conftest.py b/tests/conftest.py index e4c2384e874f1..65901a6378da2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -739,7 +739,7 @@ class VllmRunner: tokenizer_name: str | None = None, tokenizer_mode: str = "auto", trust_remote_code: bool = True, - seed: int | None = 0, + seed: int = 0, max_model_len: int | None = 1024, dtype: str = "auto", disable_log_stats: bool = True, diff --git a/tests/distributed/test_context_parallel.py b/tests/distributed/test_context_parallel.py index 3cb533dccd62c..aa47f28a34dd5 100644 --- a/tests/distributed/test_context_parallel.py +++ b/tests/distributed/test_context_parallel.py @@ -123,8 +123,11 @@ class CPTestSettings: CP_TEXT_GENERATION_MODELS = { "deepseek-ai/DeepSeek-V2-Lite-Chat": [ + CPTestSettings.detailed(dcp_multipliers=[1]), CPTestSettings.detailed( - dcp_multipliers=[0.5, 1], cp_kv_cache_interleave_size=64 + dcp_multipliers=[0.5], + cp_kv_cache_interleave_size=64, + attn_backend="FLASHMLA", ), ], "Qwen/Qwen2.5-1.5B-Instruct": [ diff --git a/tests/distributed/test_kvlayout.py b/tests/distributed/test_kvlayout.py index b190b2820451b..c8177f1c7c2ff 100644 --- a/tests/distributed/test_kvlayout.py +++ b/tests/distributed/test_kvlayout.py @@ -61,7 +61,7 @@ def test_get_kv_connector_cache_layout_with_multi_connector(): kv_role="kv_both", kv_connector_extra_config={ "connectors": [ - {"kv_connector": 
"SharedStorageConnector", "kv_role": "kv_both"}, + {"kv_connector": "ExampleConnector", "kv_role": "kv_both"}, {"kv_connector": "NixlConnector", "kv_role": "kv_both"}, ] }, diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index e46f118f8e846..c2cf77ffa12b6 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -350,21 +350,35 @@ def test_human_readable_model_len(): assert args.max_model_len == 1_000_000 args = parser.parse_args(["--max-model-len", "10k"]) assert args.max_model_len == 10_000 + args = parser.parse_args(["--max-model-len", "2g"]) + assert args.max_model_len == 2_000_000_000 + args = parser.parse_args(["--max-model-len", "2t"]) + assert args.max_model_len == 2_000_000_000_000 # Capital args = parser.parse_args(["--max-model-len", "3K"]) - assert args.max_model_len == 1024 * 3 + assert args.max_model_len == 2**10 * 3 args = parser.parse_args(["--max-model-len", "10M"]) assert args.max_model_len == 2**20 * 10 + args = parser.parse_args(["--max-model-len", "4G"]) + assert args.max_model_len == 2**30 * 4 + args = parser.parse_args(["--max-model-len", "4T"]) + assert args.max_model_len == 2**40 * 4 # Decimal values args = parser.parse_args(["--max-model-len", "10.2k"]) assert args.max_model_len == 10200 # ..truncated to the nearest int - args = parser.parse_args(["--max-model-len", "10.212345k"]) + args = parser.parse_args(["--max-model-len", "10.2123451234567k"]) assert args.max_model_len == 10212 + args = parser.parse_args(["--max-model-len", "10.2123451234567m"]) + assert args.max_model_len == 10212345 + args = parser.parse_args(["--max-model-len", "10.2123451234567g"]) + assert args.max_model_len == 10212345123 + args = parser.parse_args(["--max-model-len", "10.2123451234567t"]) + assert args.max_model_len == 10212345123456 # Invalid (do not allow decimals with binary multipliers) - for invalid in ["1a", "pwd", "10.24", "1.23M"]: + for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]: with pytest.raises(ArgumentError): - args = parser.parse_args(["--max-model-len", invalid]) + parser.parse_args(["--max-model-len", invalid]) diff --git a/tests/entrypoints/openai/test_chat_error.py b/tests/entrypoints/openai/test_chat_error.py new file mode 100644 index 0000000000000..102eeaf614410 --- /dev/null +++ b/tests/entrypoints/openai/test_chat_error.py @@ -0,0 +1,228 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass, field +from http import HTTPStatus +from typing import Any +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from vllm.config.multimodal import MultiModalConfig +from vllm.entrypoints.openai.protocol import ChatCompletionRequest, ErrorResponse +from vllm.entrypoints.openai.serving_chat import OpenAIServingChat +from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels +from vllm.outputs import CompletionOutput, RequestOutput +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.v1.engine.async_llm import AsyncLLM + +MODEL_NAME = "openai-community/gpt2" +MODEL_NAME_SHORT = "gpt2" +BASE_MODEL_PATHS = [ + BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME), + BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT), +] + + +@dataclass +class MockHFConfig: + model_type: str = "any" + + +@dataclass +class MockModelConfig: + task = "generate" + runner_type = "generate" + tokenizer = MODEL_NAME + trust_remote_code = False + tokenizer_mode = "auto" + 
max_model_len = 100 + tokenizer_revision = None + multimodal_config = MultiModalConfig() + hf_config = MockHFConfig() + logits_processor_pattern = None + logits_processors: list[str] | None = None + diff_sampling_param: dict | None = None + allowed_local_media_path: str = "" + allowed_media_domains: list[str] | None = None + encoder_config = None + generation_config: str = "auto" + media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) + skip_tokenizer_init = False + + def get_diff_sampling_param(self): + return self.diff_sampling_param or {} + + +def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: + models = OpenAIServingModels( + engine_client=engine, + base_model_paths=BASE_MODEL_PATHS, + ) + serving_chat = OpenAIServingChat( + engine, + models, + response_role="assistant", + request_logger=None, + chat_template=None, + chat_template_content_format="auto", + ) + + async def _fake_process_inputs( + request_id, + engine_prompt, + sampling_params, + *, + lora_request, + trace_headers, + priority, + ): + return dict(engine_prompt), {} + + async def _fake_preprocess_chat(*args, **kwargs): + # return conversation, request_prompts, engine_prompts + return ( + [{"role": "user", "content": "Test"}], + [[1, 2, 3]], + [{"prompt_token_ids": [1, 2, 3]}], + ) + + serving_chat._process_inputs = AsyncMock(side_effect=_fake_process_inputs) + serving_chat._preprocess_chat = AsyncMock(side_effect=_fake_preprocess_chat) + return serving_chat + + +@pytest.mark.asyncio +async def test_chat_error_non_stream(): + """test finish_reason='error' returns 500 InternalServerError (non-streaming)""" + mock_engine = MagicMock(spec=AsyncLLM) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + mock_engine.model_config = MockModelConfig() + mock_engine.input_processor = MagicMock() + mock_engine.io_processor = MagicMock() + + serving_chat = _build_serving_chat(mock_engine) + + completion_output = CompletionOutput( + index=0, + text="", + token_ids=[], + cumulative_logprob=None, + logprobs=None, + finish_reason="error", + ) + + request_output = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output], + finished=True, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + async def mock_generate(*args, **kwargs): + yield request_output + + mock_engine.generate = MagicMock(side_effect=mock_generate) + + request = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{"role": "user", "content": "Test prompt"}], + max_tokens=10, + stream=False, + ) + + response = await serving_chat.create_chat_completion(request) + + assert isinstance(response, ErrorResponse) + assert response.error.type == "InternalServerError" + assert response.error.message == "Internal server error" + assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR + + +@pytest.mark.asyncio +async def test_chat_error_stream(): + """test finish_reason='error' returns 500 InternalServerError (streaming)""" + mock_engine = MagicMock(spec=AsyncLLM) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + mock_engine.model_config = MockModelConfig() + mock_engine.input_processor = MagicMock() + mock_engine.io_processor = MagicMock() + + serving_chat = _build_serving_chat(mock_engine) + + completion_output_1 = CompletionOutput( + index=0, + text="Hello", + token_ids=[100], + cumulative_logprob=None, + 
logprobs=None, + finish_reason=None, + ) + + request_output_1 = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output_1], + finished=False, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + completion_output_2 = CompletionOutput( + index=0, + text="Hello", + token_ids=[100], + cumulative_logprob=None, + logprobs=None, + finish_reason="error", + ) + + request_output_2 = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output_2], + finished=True, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + async def mock_generate(*args, **kwargs): + yield request_output_1 + yield request_output_2 + + mock_engine.generate = MagicMock(side_effect=mock_generate) + + request = ChatCompletionRequest( + model=MODEL_NAME, + messages=[{"role": "user", "content": "Test prompt"}], + max_tokens=10, + stream=True, + ) + + response = await serving_chat.create_chat_completion(request) + + chunks = [] + async for chunk in response: + chunks.append(chunk) + + assert len(chunks) >= 2 + assert any("Internal server error" in chunk for chunk in chunks), ( + f"Expected error message in chunks: {chunks}" + ) + assert chunks[-1] == "data: [DONE]\n\n" diff --git a/tests/entrypoints/openai/test_completion_error.py b/tests/entrypoints/openai/test_completion_error.py new file mode 100644 index 0000000000000..ca56cc2ddb6a7 --- /dev/null +++ b/tests/entrypoints/openai/test_completion_error.py @@ -0,0 +1,216 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from dataclasses import dataclass, field +from http import HTTPStatus +from typing import Any +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from vllm.config.multimodal import MultiModalConfig +from vllm.entrypoints.openai.protocol import CompletionRequest, ErrorResponse +from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion +from vllm.entrypoints.openai.serving_models import BaseModelPath, OpenAIServingModels +from vllm.outputs import CompletionOutput, RequestOutput +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.v1.engine.async_llm import AsyncLLM + +MODEL_NAME = "openai-community/gpt2" +MODEL_NAME_SHORT = "gpt2" +BASE_MODEL_PATHS = [ + BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME), + BaseModelPath(name=MODEL_NAME_SHORT, model_path=MODEL_NAME_SHORT), +] + + +@dataclass +class MockHFConfig: + model_type: str = "any" + + +@dataclass +class MockModelConfig: + task = "generate" + runner_type = "generate" + tokenizer = MODEL_NAME + trust_remote_code = False + tokenizer_mode = "auto" + max_model_len = 100 + tokenizer_revision = None + multimodal_config = MultiModalConfig() + hf_config = MockHFConfig() + logits_processor_pattern = None + logits_processors: list[str] | None = None + diff_sampling_param: dict | None = None + allowed_local_media_path: str = "" + allowed_media_domains: list[str] | None = None + encoder_config = None + generation_config: str = "auto" + media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) + skip_tokenizer_init = False + + def get_diff_sampling_param(self): + return self.diff_sampling_param or {} + + +def _build_serving_completion(engine: AsyncLLM) -> OpenAIServingCompletion: + models = OpenAIServingModels( + 
engine_client=engine, + base_model_paths=BASE_MODEL_PATHS, + ) + serving_completion = OpenAIServingCompletion( + engine, + models, + request_logger=None, + ) + + async def _fake_process_inputs( + request_id, + engine_prompt, + sampling_params, + *, + lora_request, + trace_headers, + priority, + ): + return dict(engine_prompt), {} + + serving_completion._process_inputs = AsyncMock(side_effect=_fake_process_inputs) + return serving_completion + + +@pytest.mark.asyncio +async def test_completion_error_non_stream(): + """test finish_reason='error' returns 500 InternalServerError (non-streaming)""" + mock_engine = MagicMock(spec=AsyncLLM) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + mock_engine.model_config = MockModelConfig() + mock_engine.input_processor = MagicMock() + mock_engine.io_processor = MagicMock() + + serving_completion = _build_serving_completion(mock_engine) + + completion_output = CompletionOutput( + index=0, + text="", + token_ids=[], + cumulative_logprob=None, + logprobs=None, + finish_reason="error", + ) + + request_output = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output], + finished=True, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + async def mock_generate(*args, **kwargs): + yield request_output + + mock_engine.generate = MagicMock(side_effect=mock_generate) + + request = CompletionRequest( + model=MODEL_NAME, + prompt="Test prompt", + max_tokens=10, + stream=False, + ) + + response = await serving_completion.create_completion(request) + + assert isinstance(response, ErrorResponse) + assert response.error.type == "InternalServerError" + assert response.error.message == "Internal server error" + assert response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR + + +@pytest.mark.asyncio +async def test_completion_error_stream(): + """test finish_reason='error' returns 500 InternalServerError (streaming)""" + mock_engine = MagicMock(spec=AsyncLLM) + mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) + mock_engine.errored = False + mock_engine.model_config = MockModelConfig() + mock_engine.input_processor = MagicMock() + mock_engine.io_processor = MagicMock() + + serving_completion = _build_serving_completion(mock_engine) + + completion_output_1 = CompletionOutput( + index=0, + text="Hello", + token_ids=[100], + cumulative_logprob=None, + logprobs=None, + finish_reason=None, + ) + + request_output_1 = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output_1], + finished=False, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + completion_output_2 = CompletionOutput( + index=0, + text="Hello", + token_ids=[100], + cumulative_logprob=None, + logprobs=None, + finish_reason="error", + ) + + request_output_2 = RequestOutput( + request_id="test-id", + prompt="Test prompt", + prompt_token_ids=[1, 2, 3], + prompt_logprobs=None, + outputs=[completion_output_2], + finished=True, + metrics=None, + lora_request=None, + encoder_prompt=None, + encoder_prompt_token_ids=None, + ) + + async def mock_generate(*args, **kwargs): + yield request_output_1 + yield request_output_2 + + mock_engine.generate = MagicMock(side_effect=mock_generate) + + request = CompletionRequest( + model=MODEL_NAME, + prompt="Test prompt", + max_tokens=10, + 
stream=True, + ) + + response = await serving_completion.create_completion(request) + + chunks = [] + async for chunk in response: + chunks.append(chunk) + + assert len(chunks) >= 2 + assert any("Internal server error" in chunk for chunk in chunks), ( + f"Expected error message in chunks: {chunks}" + ) + assert chunks[-1] == "data: [DONE]\n\n" diff --git a/tests/entrypoints/openai/test_responses_error.py b/tests/entrypoints/openai/test_responses_error.py new file mode 100644 index 0000000000000..f8ea178288835 --- /dev/null +++ b/tests/entrypoints/openai/test_responses_error.py @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from http import HTTPStatus +from unittest.mock import MagicMock + +import pytest + +from vllm.entrypoints.openai.protocol import ErrorResponse +from vllm.entrypoints.openai.serving_engine import GenerationError, OpenAIServing + + +@pytest.mark.asyncio +async def test_raise_if_error_raises_generation_error(): + """test _raise_if_error raises GenerationError""" + # create a minimal OpenAIServing instance + mock_engine = MagicMock() + mock_engine.model_config = MagicMock() + mock_engine.model_config.max_model_len = 100 + mock_models = MagicMock() + + serving = OpenAIServing( + engine_client=mock_engine, + models=mock_models, + request_logger=None, + ) + + # test that error finish_reason raises GenerationError + with pytest.raises(GenerationError) as exc_info: + serving._raise_if_error("error", "test-request-id") + + assert str(exc_info.value) == "Internal server error" + assert exc_info.value.status_code == HTTPStatus.INTERNAL_SERVER_ERROR + + # test that other finish_reasons don't raise + serving._raise_if_error("stop", "test-request-id") # should not raise + serving._raise_if_error("length", "test-request-id") # should not raise + serving._raise_if_error(None, "test-request-id") # should not raise + + +@pytest.mark.asyncio +async def test_convert_generation_error_to_response(): + """test _convert_generation_error_to_response creates proper ErrorResponse""" + mock_engine = MagicMock() + mock_engine.model_config = MagicMock() + mock_engine.model_config.max_model_len = 100 + mock_models = MagicMock() + + serving = OpenAIServing( + engine_client=mock_engine, + models=mock_models, + request_logger=None, + ) + + # create a GenerationError + gen_error = GenerationError("Internal server error") + + # convert to ErrorResponse + error_response = serving._convert_generation_error_to_response(gen_error) + + assert isinstance(error_response, ErrorResponse) + assert error_response.error.type == "InternalServerError" + assert error_response.error.message == "Internal server error" + assert error_response.error.code == HTTPStatus.INTERNAL_SERVER_ERROR + + +@pytest.mark.asyncio +async def test_convert_generation_error_to_streaming_response(): + """test _convert_generation_error_to_streaming_response output""" + mock_engine = MagicMock() + mock_engine.model_config = MagicMock() + mock_engine.model_config.max_model_len = 100 + mock_models = MagicMock() + + serving = OpenAIServing( + engine_client=mock_engine, + models=mock_models, + request_logger=None, + ) + + # create a GenerationError + gen_error = GenerationError("Internal server error") + + # convert to streaming error response + error_json = serving._convert_generation_error_to_streaming_response(gen_error) + + assert isinstance(error_json, str) + assert "Internal server error" in error_json + assert "InternalServerError" in error_json diff --git 
a/tests/entrypoints/test_responses_utils.py b/tests/entrypoints/test_responses_utils.py index 3951bd4840085..a522967111307 100644 --- a/tests/entrypoints/test_responses_utils.py +++ b/tests/entrypoints/test_responses_utils.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest +from openai.types.responses.response_function_tool_call import ResponseFunctionToolCall from openai.types.responses.response_function_tool_call_output_item import ( ResponseFunctionToolCallOutputItem, ) @@ -14,7 +15,8 @@ from openai.types.responses.response_reasoning_item import ( ) from vllm.entrypoints.responses_utils import ( - construct_chat_message_with_tool_call, + _construct_single_message_from_response_item, + construct_chat_messages_with_tool_call, convert_tool_responses_to_completions_format, ) @@ -42,7 +44,43 @@ class TestResponsesUtils: assert result == {"type": "function", "function": input_tool} - def test_construct_chat_message_with_tool_call(self): + def test_construct_chat_messages_with_tool_call(self): + """Test construction of chat messages with tool calls.""" + reasoning_item = ResponseReasoningItem( + id="lol", + summary=[], + type="reasoning", + content=[ + Content( + text="Leroy Jenkins", + type="reasoning_text", + ) + ], + encrypted_content=None, + status=None, + ) + mcp_tool_item = ResponseFunctionToolCall( + id="mcp_123", + call_id="call_123", + type="function_call", + status="completed", + name="python", + arguments='{"code": "123+456"}', + ) + input_items = [reasoning_item, mcp_tool_item] + messages = construct_chat_messages_with_tool_call(input_items) + + assert len(messages) == 1 + message = messages[0] + assert message["role"] == "assistant" + assert message["reasoning"] == "Leroy Jenkins" + assert message["tool_calls"][0]["id"] == "call_123" + assert message["tool_calls"][0]["function"]["name"] == "python" + assert ( + message["tool_calls"][0]["function"]["arguments"] == '{"code": "123+456"}' + ) + + def test_construct_single_message_from_response_item(self): item = ResponseReasoningItem( id="lol", summary=[], @@ -56,7 +94,7 @@ class TestResponsesUtils: encrypted_content=None, status=None, ) - formatted_item = construct_chat_message_with_tool_call(item) + formatted_item = _construct_single_message_from_response_item(item) assert formatted_item["role"] == "assistant" assert formatted_item["reasoning"] == "Leroy Jenkins" @@ -74,7 +112,7 @@ class TestResponsesUtils: status=None, ) - formatted_item = construct_chat_message_with_tool_call(item) + formatted_item = _construct_single_message_from_response_item(item) assert formatted_item["role"] == "assistant" assert ( formatted_item["reasoning"] @@ -88,7 +126,7 @@ class TestResponsesUtils: output="1234", status="completed", ) - formatted_item = construct_chat_message_with_tool_call(tool_call_output) + formatted_item = _construct_single_message_from_response_item(tool_call_output) assert formatted_item["role"] == "tool" assert formatted_item["content"] == "1234" assert formatted_item["tool_call_id"] == "temp" @@ -102,7 +140,7 @@ class TestResponsesUtils: status=None, ) with pytest.raises(ValueError): - construct_chat_message_with_tool_call(item) + _construct_single_message_from_response_item(item) output_item = ResponseOutputMessage( id="msg_bf585bbbe3d500e0", @@ -119,6 +157,6 @@ class TestResponsesUtils: type="message", ) - formatted_item = construct_chat_message_with_tool_call(output_item) + formatted_item = _construct_single_message_from_response_item(output_item) assert formatted_item["role"] 
== "assistant" assert formatted_item["content"] == "dongyi" diff --git a/tests/kernels/attention/test_cpu_attn.py b/tests/kernels/attention/test_cpu_attn.py index fb3b1799ba48e..be5d66197f6ef 100644 --- a/tests/kernels/attention/test_cpu_attn.py +++ b/tests/kernels/attention/test_cpu_attn.py @@ -7,7 +7,8 @@ import math import pytest import torch -from vllm.platforms import current_platform +from vllm.platforms import CpuArchEnum, current_platform +from vllm.v1.attention.backends.cpu_attn import _get_attn_isa if not current_platform.is_cpu(): pytest.skip("skipping CPU-only tests", allow_module_level=True) @@ -36,6 +37,21 @@ SEQ_LENS = [ # (q_len, kv_len) ] +def get_attn_isa( + block_size: int | None = None, + dtype: torch.dtype | None = None, +): + if block_size and dtype: + return _get_attn_isa(dtype, block_size) + else: + if current_platform.get_cpu_architecture() == CpuArchEnum.ARM: + return "neon" + elif torch._C._cpu._is_amx_tile_supported(): + return "amx" + else: + return "vec" + + # rand number generation takes too much time, cache rand tensors @functools.lru_cache(maxsize=128, typed=False) def tensor_cache( @@ -452,6 +468,49 @@ def test_varlen_with_paged_kv_normal_vec16( ) +@pytest.mark.parametrize("seq_lens", SEQ_LENS) +@pytest.mark.parametrize("num_heads", NUM_HEADS) +@pytest.mark.parametrize("head_size", HEAD_SIZES) +@pytest.mark.parametrize("block_size", [96, 128]) +@pytest.mark.parametrize("sliding_window", SLIDING_WINDOWS) +@pytest.mark.parametrize("dtype", QTYPES) +@pytest.mark.parametrize("soft_cap", [None]) +@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) +@pytest.mark.parametrize("use_alibi", [False]) +@pytest.mark.parametrize("use_sink", [False]) +@pytest.mark.parametrize("isa", ["neon"]) +@pytest.mark.skipif( + current_platform.get_cpu_architecture() != CpuArchEnum.ARM, + reason="Not an Arm CPU.", +) +def test_varlen_with_paged_kv_normal_neon( + seq_lens: list[tuple[int, int]], + num_heads: tuple[int, int], + head_size: int, + sliding_window: int | None, + dtype: torch.dtype, + block_size: int, + soft_cap: float | None, + num_blocks: int, + use_alibi: bool, + use_sink: bool, + isa: str, +) -> None: + varlen_with_paged_kv( + seq_lens=seq_lens, + num_heads=num_heads, + head_size=head_size, + sliding_window=sliding_window, + dtype=dtype, + block_size=block_size, + soft_cap=soft_cap, + num_blocks=num_blocks, + use_alibi=use_alibi, + use_sink=use_sink, + isa=isa, + ) + + @pytest.mark.parametrize("seq_lens", SEQ_LENS) @pytest.mark.parametrize("num_heads", NUM_HEADS) @pytest.mark.parametrize("head_size", [96]) @@ -462,9 +521,7 @@ def test_varlen_with_paged_kv_normal_vec16( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("use_alibi", [False]) @pytest.mark.parametrize("use_sink", [False]) -@pytest.mark.parametrize( - "isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"] -) +@pytest.mark.parametrize("isa", [get_attn_isa()]) def test_varlen_with_paged_kv_softcap( seq_lens: list[tuple[int, int]], num_heads: tuple[int, int], @@ -503,9 +560,7 @@ def test_varlen_with_paged_kv_softcap( @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("use_alibi", [True]) @pytest.mark.parametrize("use_sink", [False]) -@pytest.mark.parametrize( - "isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"] -) +@pytest.mark.parametrize("isa", [get_attn_isa()]) def test_varlen_with_paged_kv_alibi( seq_lens: list[tuple[int, int]], num_heads: tuple[int, int], @@ -544,9 +599,7 @@ def test_varlen_with_paged_kv_alibi( 
@pytest.mark.parametrize("num_blocks", NUM_BLOCKS) @pytest.mark.parametrize("use_alibi", [False]) @pytest.mark.parametrize("use_sink", [True]) -@pytest.mark.parametrize( - "isa", ["amx"] if torch._C._cpu._is_amx_tile_supported() else ["vec"] -) +@pytest.mark.parametrize("isa", [get_attn_isa()]) def test_varlen_with_paged_kv_sink( seq_lens: list[tuple[int, int]], num_heads: tuple[int, int], diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index ae3c63cc62d6b..639abdf6f0487 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -26,7 +26,14 @@ def clear_cache(): _cached_get_attn_backend.cache_clear() -@pytest.mark.parametrize("device", ["cpu", "hip", "cuda"]) +devices = ["cpu"] +if current_platform.is_cuda(): + devices.append("cuda") +if current_platform.is_rocm(): + devices.append("hip") + + +@pytest.mark.parametrize("device", devices) def test_mha_attn_platform(device: str): """ Test the attention selector between different platform and device. @@ -46,7 +53,7 @@ def test_mha_attn_platform(device: str): patch("vllm.model_executor.models.vision.current_platform", RocmPlatform()), ): attn = MultiHeadAttention(16, 64, scale=1) - assert attn.attn_backend == AttentionBackendEnum.TORCH_SDPA + assert attn.attn_backend == AttentionBackendEnum.FLASH_ATTN else: # Test CUDA with head_size=64 (divisible by 32) # - should use vLLM's FlashAttention diff --git a/tests/kernels/moe/test_ocp_mx_moe.py b/tests/kernels/moe/test_ocp_mx_moe.py index 91b508d4163cc..5a850dda4f6fd 100644 --- a/tests/kernels/moe/test_ocp_mx_moe.py +++ b/tests/kernels/moe/test_ocp_mx_moe.py @@ -70,12 +70,12 @@ def test_mxfp4_loading_and_execution_moe(vllm_runner, model_case: ModelCase): f"{torch.cuda.device_count()}" ) - # `cuda_graph_sizes=[16]` to reduce load time. + # `cudagraph_capture_sizes=[16]` to reduce load time. 
with vllm_runner( model_case.model_id, tensor_parallel_size=model_case.tp, load_format="dummy", - cuda_graph_sizes=[16], + cudagraph_capture_sizes=[16], ) as llm: # Disabled as check_model is broken: https://github.com/vllm-project/vllm/pull/18465#issuecomment-3329880562 # def check_model(model): diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index 830d43569e98b..e29f66dca313f 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -103,7 +103,7 @@ def ref_dynamic_per_tensor_fp8_quant( .clamp(fp8_traits_min, fp8_traits_max) .to(FP8_DTYPE) ) - return ref_out, ref_scale.view((1, 1)) + return ref_out, ref_scale.view(1) def native_w8a8_block_matmul( diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py index d0e4f6554a91f..32c77b9a01ece 100644 --- a/tests/kernels/quantization/test_block_fp8.py +++ b/tests/kernels/quantization/test_block_fp8.py @@ -54,6 +54,10 @@ def setup_cuda(): torch.set_default_device("cuda") +@pytest.mark.skipif( + current_platform.is_fp8_fnuz(), + reason="This platform supports e4m3fnuz, not e4m3fn.", +) @pytest.mark.parametrize( "num_tokens,d,dtype,group_size,seed", itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE, SEEDS), @@ -78,14 +82,14 @@ def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed): def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): torch.manual_seed(seed) factor_for_scale = 1e-2 - fp8_info = torch.finfo(torch.float8_e4m3fn) + fp8_info = torch.finfo(current_platform.fp8_dtype()) fp8_max, fp8_min = fp8_info.max, fp8_info.min A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max - A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(current_platform.fp8_dtype()) B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max - B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(current_platform.fp8_dtype()) block_n, block_k = block_size[0], block_size[1] n_tiles = (N + block_n - 1) // block_n @@ -103,6 +107,9 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed): assert rel_diff < 0.001 +@pytest.mark.skipif( + not current_platform.is_cuda(), reason="CUTLASS only supported on CUDA platform." 
+) @torch.inference_mode() def test_w8a8_block_fp8_cutlass_matmul(): # Test simple case where weight.shape % 128 != 0, @@ -151,6 +158,10 @@ def test_w8a8_block_fp8_cutlass_matmul(): assert rel_diff < 0.001 +@pytest.mark.skipif( + current_platform.is_fp8_fnuz(), + reason="This platform supports e4m3fnuz, not e4m3fn.", +) @pytest.mark.parametrize( "M,N,K,block_size,out_dtype,seed", itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES, SEEDS), diff --git a/tests/kernels/quantization/test_cutlass_scaled_mm.py b/tests/kernels/quantization/test_cutlass_scaled_mm.py index de595b0a34e46..bc4744df7e69e 100644 --- a/tests/kernels/quantization/test_cutlass_scaled_mm.py +++ b/tests/kernels/quantization/test_cutlass_scaled_mm.py @@ -15,6 +15,9 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv +if not current_platform.is_cuda(): + pytest.skip("These tests use CUTLASS which requires CUDA", allow_module_level=True) + MNK_FACTORS = [ (1, 256, 128), (1, 16384, 1024), diff --git a/tests/kernels/quantization/test_cutlass_w4a8.py b/tests/kernels/quantization/test_cutlass_w4a8.py index 465e24fd7eb97..8cfc993fe8e82 100644 --- a/tests/kernels/quantization/test_cutlass_w4a8.py +++ b/tests/kernels/quantization/test_cutlass_w4a8.py @@ -12,12 +12,18 @@ import torch from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.utils.quant_utils import ( + convert_packed_uint4b8_to_signed_int4_inplace, + pack_cols, pack_rows, quantize_weights, + unpack_quantized_values_into_int32, ) from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types +if not current_platform.is_cuda(): + pytest.skip("These tests use CUTLASS which requires CUDA", allow_module_level=True) + # TODO: in future PR refactor this and `is_quant_method_supported` in the kernel # unit tests to a common utility function. Currently the use of # `is_quant_method_supported` conflates kernels with quantization methods @@ -167,8 +173,7 @@ def create_test_tensors( # for the practical use case we need per-tok scales for fp8 activations w_tok_s = torch.randn((m,), device="cuda", dtype=types.token_scale_type) - # weights are already per-group quantized, use placeholder here - w_ch_s = torch.ones((n,), device="cuda", dtype=types.channel_scale_type) + w_ch_s = torch.randn((n,), device="cuda", dtype=types.channel_scale_type) return Tensors( w_ref=w_ref, @@ -211,7 +216,7 @@ def mm_test_helper( print(output_ref) torch.testing.assert_close( - output, output_ref.to(output.dtype), rtol=1e-3, atol=1e-3 + output, output_ref.to(output.dtype), rtol=1e-2, atol=1e-2 ) @@ -257,7 +262,7 @@ def test_w4a8_cuda_graph(): ) w_tok_s = torch.randn((m,), device="cuda", dtype=torch.float32) - w_ch_s = torch.ones((n,), device="cuda", dtype=torch.float32) + w_ch_s = torch.randn((n,), device="cuda", dtype=torch.float32) # Construct a trivial model with a single layer that calls the kernel model = W4A8Layer( @@ -287,4 +292,38 @@ def test_w4a8_cuda_graph(): output.zero_() g.replay() - torch.testing.assert_close(output, output_ref, rtol=1e-3, atol=1e-3) + torch.testing.assert_close(output, output_ref, rtol=1e-2, atol=1e-2) + + +@pytest.mark.skipif( + not IS_SUPPORTED_BY_GPU, reason="CUTLASS W4A8 is not supported on this GPU type." +) +@pytest.mark.parametrize("shape", MNK_SHAPES) +def test_convert_packed_uint4b8_to_signed_int4_inplace(shape): + """ + The W4A16 checkpoints encode the weights as int4b8 packed to int32. 
+ The CUTLASS kernels expect signed int4 packed to int32. + This test checks that the runtime int4b8 -> signed int4 conversion + matches the offline conversion step exactly. + """ + _, N, K = shape + # random weights packed to int32 + t = torch.randint( + low=torch.iinfo(torch.int32).min, + high=torch.iinfo(torch.int32).max + 1, + size=(N, K // 8), + dtype=torch.int32, + device="cuda", + ) + + # compute reference + unpacked = unpack_quantized_values_into_int32( + t.clone(), scalar_types.uint4b8, packed_dim=1 + ) + unpacked = unpacked - 8 # int4b8 -> signed int4 + ref = pack_cols(unpacked & 0x0F, 4, *unpacked.shape) + + out = convert_packed_uint4b8_to_signed_int4_inplace(t.clone()) + + assert torch.equal(ref, out) + assert not torch.equal(ref, t) diff --git a/tests/kernels/quantization/test_cutlass_w4a8_moe.py b/tests/kernels/quantization/test_cutlass_w4a8_moe.py new file mode 100644 index 0000000000000..3560402a29e90 --- /dev/null +++ b/tests/kernels/quantization/test_cutlass_w4a8_moe.py @@ -0,0 +1,340 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Tests for the CUTLASS-based W4A8 grouped GEMM kernel and the full MoE layer. +""" + +import random +from dataclasses import dataclass + +import pytest +import torch + +from vllm import _custom_ops as ops +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + pack_rows, + quantize_weights, +) +from vllm.platforms import current_platform +from vllm.scalar_type import ScalarType, scalar_types + +IS_SUPPORTED_BY_GPU = current_platform.get_device_capability()[0] >= 9 + + +def to_fp8(tensor: torch.Tensor) -> torch.Tensor: + finfo = torch.finfo(torch.float8_e4m3fn) + return tensor.clamp(min=finfo.min, max=finfo.max).to(dtype=torch.float8_e4m3fn) + + +def cutlass_quantize( + atype: torch.dtype, + w: torch.Tensor, + wtype: ScalarType, + stype: torch.dtype | None, + group_size: int | None, + zero_points: bool = False, +): + """ + Quantize weights into W4 and compute reference dequantized weights. + + Encoding/reordering of weights and packing of scales is deferred + until after all experts are combined. + """ + assert wtype.is_integer(), "TODO: support floating point weights" + + w_ref, w_q, w_s, w_zp = quantize_weights( + w, wtype, group_size=group_size, zero_points=zero_points + ) + + # Since scales are later cast to fp8, recompute w_ref in atype here. + w_ref = ( + w_q.to(torch.float32) + * w_s.to(atype).to(torch.float32).repeat_interleave(group_size, dim=0) + ).to(atype) + + # Bit mask prevents sign extension of int4 when packing. + w_q = pack_rows(w_q & 0x0F, wtype.size_bits, *w_q.shape) + # Make weights row-major (N, K). + w_q = w_q.t().contiguous() + + return w_ref, w_q, w_s.to(atype), w_zp + + +def cutlass_preprocess( + w_q_experts: list[torch.Tensor], w_s_experts: list[torch.Tensor] +): + """ + Reorder/encode expert weights and pack scales. + + Returns: + w_q_packed: Packed/encoded int4 weights for all experts. + w_s_packed: Packed fp8 scales for all experts. + packed_layout: Layout/stride metadata for grouped GEMM.
+ """ + w_s_packed = ops.cutlass_pack_scale_fp8(torch.stack(w_s_experts)) + w_q_packed, packed_layout = ops.cutlass_encode_and_reorder_int4b_grouped( + torch.stack(w_q_experts) + ) # expects dim 3 + return w_q_packed, w_s_packed, packed_layout + + +GROUP_SIZE = 128 +# (num_experts, N, K) +TEST_SHAPES = [ + (8, 512, 2048), + (8, 2048, 2048), + (64, 512, 1024), + (64, 2048, 2048), + (4, 2048, 768), + (8, 768, 2048), + (64, 1536, 2048), + (128, 8192, 4096), # test overflow int32 +] +ALIGNMENT = 16 # torch._scaled_mm alignment for M, needed for reference check + + +@dataclass +class MoETestSetup: + num_experts: int + K: int + N: int + Ms: list[int] + M_full: int + a: torch.Tensor + a_ref: torch.Tensor + a_strides: torch.Tensor + out: torch.Tensor + c_strides: torch.Tensor + per_tok_scales: torch.Tensor + per_chan_scales: torch.Tensor + w_refs: list[torch.Tensor] + w_q_packed: torch.Tensor + w_s_packed: torch.Tensor + problem_sizes: torch.Tensor + expert_offsets: torch.Tensor + b_strides: torch.Tensor + group_scale_strides: torch.Tensor + + +def make_moe_test_setup( + num_experts: int, + K: int, + N: int, + *, + alignment: int = ALIGNMENT, + max_blocks: int = 64, + device: str = "cuda", + random_zero: bool = False, +) -> MoETestSetup: + """Create a full set of tensors for testing cutlass_w4a8_moe_mm.""" + + assert K % GROUP_SIZE == 0 + # Token counts per expert (multiples of `alignment`). + Ms = [alignment * random.randint(1, max_blocks) for _ in range(num_experts)] + + # set random experts to 0 tokens + if random_zero and num_experts > 1: + num_zero = max(1, num_experts // 8) + zero_indices = random.sample(range(num_experts), k=num_zero) + for idx in zero_indices: + Ms[idx] = 0 + + M_full = sum(Ms) + assert M_full > 0 + + # Activations. + a = to_fp8(torch.randn((M_full, K), device=device)) + a_ref = a.to(torch.float32) + a_strides = torch.full((num_experts,), K, dtype=torch.int64, device=device) + + # Output buffer. + out = torch.empty((M_full, N), dtype=torch.bfloat16, device=device) + c_strides = torch.full((num_experts,), N, dtype=torch.int64, device=device) + + # Channel/token scales. + per_tok_scales = torch.randn((M_full, 1), dtype=torch.float32, device=device) + per_chan_scales = torch.randn( + (num_experts, N, 1), dtype=torch.float32, device=device + ) + + # Expert weights and scales. + wtype = scalar_types.int4 + atype = stype = torch.float8_e4m3fn + w_refs, w_qs, w_ss = [], [], [] + for _ in range(num_experts): + b = to_fp8(torch.randn((K, N), device=device)) + w_ref, w_q, w_s, _ = cutlass_quantize( + atype, b.to(torch.float16), wtype, stype, GROUP_SIZE, zero_points=False + ) + w_refs.append(w_ref) + w_qs.append(w_q) + w_ss.append(w_s) + + w_q_packed, w_s_packed, packed_layout = cutlass_preprocess(w_qs, w_ss) + + problem_sizes = torch.tensor( + [[N, M, K] for M in Ms], dtype=torch.int32, device=device + ) + + expert_offsets = torch.cat( + [ + torch.tensor([0], dtype=torch.int64), + torch.cumsum(torch.tensor(Ms, dtype=torch.int64), dim=0)[:-1], + ] + ).to(device=device) + + # B strides and group scale strides. 
+ b_strides = packed_layout + group_scale_strides = torch.zeros( + (num_experts, 2), dtype=torch.int64, device=device + ) + group_scale_strides[:, 0] = N + + return MoETestSetup( + num_experts=num_experts, + K=K, + N=N, + Ms=Ms, + M_full=M_full, + a=a, + a_ref=a_ref, + a_strides=a_strides, + out=out, + c_strides=c_strides, + per_tok_scales=per_tok_scales, + per_chan_scales=per_chan_scales, + w_refs=w_refs, + w_q_packed=w_q_packed, + w_s_packed=w_s_packed, + problem_sizes=problem_sizes, + expert_offsets=expert_offsets, + b_strides=b_strides, + group_scale_strides=group_scale_strides, + ) + + +def compute_moe_reference_output(setup: MoETestSetup) -> torch.Tensor: + """Compute reference output using torch._scaled_mm per expert.""" + out_ref = torch.empty_like(setup.out) + + ends = torch.cumsum(torch.tensor(setup.Ms), 0).tolist() + starts = setup.expert_offsets.cpu().tolist() + + for i in range(setup.num_experts): + start, end = starts[i], ends[i] + if start == end: + continue + + out_ref_i = torch._scaled_mm( + setup.a_ref[start:end].to(torch.float8_e4m3fn), + setup.w_refs[i].to(torch.float8_e4m3fn).t().contiguous().t(), + setup.per_tok_scales[start:end], # (M, 1) + setup.per_chan_scales[i].reshape(1, -1), # (1, N) + out_dtype=torch.bfloat16, + use_fast_accum=True, + ) + out_ref[start:end] = out_ref_i + + return out_ref + + +@pytest.mark.skipif( + not IS_SUPPORTED_BY_GPU, + reason="W4A8 Grouped GEMM is not supported on this GPU type.", +) +@pytest.mark.parametrize("shape", TEST_SHAPES) +@pytest.mark.parametrize("random_zero", [True, False]) +def test_cutlass_w4a8_moe_mm_end_to_end(shape, random_zero): + num_experts, N, K = shape + current_platform.seed_everything(42) + setup = make_moe_test_setup( + num_experts=num_experts, K=K, N=N, max_blocks=64, random_zero=random_zero + ) + + ops.cutlass_w4a8_moe_mm( + setup.out, + setup.a, + setup.w_q_packed, + setup.per_tok_scales, + setup.per_chan_scales, + setup.w_s_packed, + GROUP_SIZE, + setup.expert_offsets, + setup.problem_sizes, + setup.a_strides, + setup.b_strides, + setup.c_strides, + setup.group_scale_strides, + ) + torch.cuda.synchronize() + + out_ref = compute_moe_reference_output(setup) + torch.testing.assert_close(setup.out, out_ref, rtol=1e-2, atol=1e-2) + + +class W4A8MoELayer(torch.nn.Module): + """ + Minimal wrapper module to test cuda graphs + """ + + def __init__(self, setup: MoETestSetup): + super().__init__() + self.setup = setup + + def forward(self, a: torch.Tensor) -> torch.Tensor: + s = self.setup + ops.cutlass_w4a8_moe_mm( + s.out, + a, + s.w_q_packed, + s.per_tok_scales, + s.per_chan_scales, + s.w_s_packed, + GROUP_SIZE, + s.expert_offsets, + s.problem_sizes, + s.a_strides, + s.b_strides, + s.c_strides, + s.group_scale_strides, + ) + return s.out + + +@pytest.mark.skipif( + not IS_SUPPORTED_BY_GPU, + reason="W4A8 Grouped GEMM is not supported on this GPU type.", +) +def test_cutlass_w4a8_moe_mm_cuda_graph(): + current_platform.seed_everything(42) + # Fixed config for CUDA graph test (single parameter point). + num_experts = 8 + K = 512 + N = 2048 + + setup = make_moe_test_setup( + num_experts=num_experts, + K=K, + N=N, + max_blocks=32, + ) + + # Construct model that calls the grouped GEMM kernel. + model = W4A8MoELayer(setup) + + # Build reference output once. + out_ref = compute_moe_reference_output(setup) + + # Capture and run the model in a CUDA graph. 
+ a_static = setup.a.clone() # static input tensor for graph replay + + stream = torch.cuda.Stream() + with torch.cuda.stream(stream): + g = torch.cuda.CUDAGraph() + with torch.cuda.graph(g): + out_static = model(a_static) + + out_static.zero_() + g.replay() + + torch.testing.assert_close(out_static, out_ref, rtol=1e-2, atol=1e-2) diff --git a/tests/lora/test_moe_lora_align_sum.py b/tests/lora/test_moe_lora_align_sum.py index 72f1d759f1e7a..3a17f3eba6e8b 100644 --- a/tests/lora/test_moe_lora_align_sum.py +++ b/tests/lora/test_moe_lora_align_sum.py @@ -32,7 +32,7 @@ def sample_data(num_experts, max_loras, num_tokens, topk_num): @pytest.mark.parametrize("num_tokens", [100, 200, 1024, 4096]) # 81920 @pytest.mark.parametrize("topk_num", [6]) -@pytest.mark.parametrize("num_experts", [64, 128]) +@pytest.mark.parametrize("num_experts", [64, 128, 256, 512]) @pytest.mark.parametrize("max_loras", [2, 32]) @pytest.mark.parametrize("block_size", [16]) def test_moe_lora_align_block_size( diff --git a/tests/models/language/pooling/test_mm_classifier_conversion.py b/tests/models/language/pooling/test_mm_classifier_conversion.py index a31a771238e26..d50ee85b9fd2b 100644 --- a/tests/models/language/pooling/test_mm_classifier_conversion.py +++ b/tests/models/language/pooling/test_mm_classifier_conversion.py @@ -17,7 +17,6 @@ def test_idefics_multimodal( with vllm_runner( model_name="HuggingFaceM4/Idefics3-8B-Llama3", runner="pooling", - task="classify", convert="classify", load_format="dummy", max_model_len=512, @@ -86,7 +85,6 @@ def test_gemma_multimodal( with vllm_runner( model_name="google/gemma-3-4b-it", runner="pooling", - task="classify", convert="classify", load_format="auto", hf_overrides=update_config, diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py index eca2b61e37d53..592862c2a0bb0 100644 --- a/tests/models/multimodal/generation/test_whisper.py +++ b/tests/models/multimodal/generation/test_whisper.py @@ -92,16 +92,19 @@ def run_test( *, tensor_parallel_size: int, distributed_executor_backend: str | None = None, + dtype: str = "half", ) -> None: prompt_list = PROMPTS * 10 expected_list = EXPECTED[model] * 10 with vllm_runner( model, - dtype="half", + dtype=dtype, max_model_len=448, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, + # TODO (NickLucche) figure out output differences with non-eager and re-enable + enforce_eager=True, ) as vllm_model: llm = vllm_model.llm @@ -120,12 +123,28 @@ def run_test( @pytest.mark.core_model @pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) +@pytest.mark.parametrize("dtype", ["half"]) @create_new_process_for_each_test() -def test_models(vllm_runner, model) -> None: +def test_models(vllm_runner, model, dtype) -> None: run_test( vllm_runner, model, tensor_parallel_size=1, + dtype=dtype, + ) + + +@pytest.mark.cpu_model +@pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) +@pytest.mark.parametrize("dtype", ["half"]) +def test_models_cpu(vllm_runner, model, dtype) -> None: + # @create_new_process_for_each_test() does not work for some runners + # TODO: to fix cpu privilege issues in run-cpu-test-arm.sh + run_test( + vllm_runner, + model, + tensor_parallel_size=1, + dtype=dtype, ) diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py index 639e290406fe2..636cd0ffd445e 100644 --- a/tests/multimodal/test_utils.py +++ b/tests/multimodal/test_utils.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: 
Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import asyncio import base64 import mimetypes import os @@ -186,6 +187,7 @@ async def test_fetch_image_error_conversion(): connector.fetch_image(broken_img) +@pytest.mark.flaky(reruns=3, reruns_delay=5) @pytest.mark.asyncio @pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) @pytest.mark.parametrize("num_frames", [-1, 32, 1800]) @@ -198,8 +200,12 @@ async def test_fetch_video_http(video_url: str, num_frames: int): } ) - video_sync, metadata_sync = connector.fetch_video(video_url) - video_async, metadata_async = await connector.fetch_video_async(video_url) + try: + video_sync, metadata_sync = connector.fetch_video(video_url) + video_async, metadata_async = await connector.fetch_video_async(video_url) + except (TimeoutError, asyncio.TimeoutError) as e: + pytest.skip(f"Timeout fetching video (CI network flakiness): {e}") + assert np.array_equal(video_sync, video_async) assert metadata_sync == metadata_async diff --git a/tests/multimodal/test_video.py b/tests/multimodal/test_video.py index 6ed21de368ac3..eccaa53ea1004 100644 --- a/tests/multimodal/test_video.py +++ b/tests/multimodal/test_video.py @@ -147,7 +147,7 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch): """ Regression test for handling videos with broken frames. This test uses a pre-corrupted video file (assets/corrupted.mp4) that - contains broken/unreadable frames to verify the video loader handles + contains broken frames to verify the video loader handles them gracefully without crashing and returns accurate metadata. """ with monkeypatch.context() as m: @@ -177,3 +177,125 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch): f"Expected fewer than {metadata['total_num_frames']} frames, " f"but loaded {frames.shape[0]} frames" ) + + +@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_1") +class TestVideoBackendOverride1(VideoLoader): + """Test loader that returns FAKE_OUTPUT_1 to verify backend selection.""" + + @classmethod + def load_bytes( + cls, data: bytes, num_frames: int = -1, **kwargs + ) -> tuple[npt.NDArray, dict]: + return FAKE_OUTPUT_1, {"video_backend": "test_video_backend_override_1"} + + +@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_2") +class TestVideoBackendOverride2(VideoLoader): + """Test loader that returns FAKE_OUTPUT_2 to verify backend selection.""" + + @classmethod + def load_bytes( + cls, data: bytes, num_frames: int = -1, **kwargs + ) -> tuple[npt.NDArray, dict]: + return FAKE_OUTPUT_2, {"video_backend": "test_video_backend_override_2"} + + +def test_video_media_io_backend_kwarg_override(monkeypatch: pytest.MonkeyPatch): + """ + Test that video_backend kwarg can override the VLLM_VIDEO_LOADER_BACKEND + environment variable. + + This allows users to dynamically select a different video backend + via --media-io-kwargs without changing the global env var, which is + useful when plugins set a default backend but a specific request + needs a different one. 
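+
+    A request-level override would typically arrive via something like
+    --media-io-kwargs '{"video": {"video_backend": "..."}}'; the exact CLI
+    plumbing is outside the scope of this unit test.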
+ """ + with monkeypatch.context() as m: + # Set the env var to one backend + m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_1") + + imageio = ImageMediaIO() + + # Without video_backend kwarg, should use env var backend + videoio_default = VideoMediaIO(imageio, num_frames=10) + frames_default, metadata_default = videoio_default.load_bytes(b"test") + np.testing.assert_array_equal(frames_default, FAKE_OUTPUT_1) + assert metadata_default["video_backend"] == "test_video_backend_override_1" + + # With video_backend kwarg, should override env var + videoio_override = VideoMediaIO( + imageio, num_frames=10, video_backend="test_video_backend_override_2" + ) + frames_override, metadata_override = videoio_override.load_bytes(b"test") + np.testing.assert_array_equal(frames_override, FAKE_OUTPUT_2) + assert metadata_override["video_backend"] == "test_video_backend_override_2" + + +def test_video_media_io_backend_kwarg_not_passed_to_loader( + monkeypatch: pytest.MonkeyPatch, +): + """ + Test that video_backend kwarg is consumed by VideoMediaIO and NOT passed + through to the underlying video loader's load_bytes method. + + This ensures the kwarg is properly popped from kwargs before forwarding. + """ + + @VIDEO_LOADER_REGISTRY.register("test_reject_video_backend_kwarg") + class RejectVideoBackendKwargLoader(VideoLoader): + """Test loader that fails if video_backend is passed through.""" + + @classmethod + def load_bytes( + cls, data: bytes, num_frames: int = -1, **kwargs + ) -> tuple[npt.NDArray, dict]: + # This should never receive video_backend in kwargs + if "video_backend" in kwargs: + raise AssertionError( + "video_backend should be consumed by VideoMediaIO, " + "not passed to loader" + ) + return FAKE_OUTPUT_1, {"received_kwargs": list(kwargs.keys())} + + with monkeypatch.context() as m: + m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_reject_video_backend_kwarg") + + imageio = ImageMediaIO() + + # Even when video_backend is provided, it should NOT be passed to loader + videoio = VideoMediaIO( + imageio, + num_frames=10, + video_backend="test_reject_video_backend_kwarg", + other_kwarg="should_pass_through", + ) + + # This should NOT raise AssertionError + frames, metadata = videoio.load_bytes(b"test") + np.testing.assert_array_equal(frames, FAKE_OUTPUT_1) + # Verify other kwargs are still passed through + assert "other_kwarg" in metadata["received_kwargs"] + + +def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch): + """ + Test that when video_backend kwarg is None or not provided, + VideoMediaIO falls back to VLLM_VIDEO_LOADER_BACKEND env var. 
+ """ + with monkeypatch.context() as m: + m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_2") + + imageio = ImageMediaIO() + + # Explicit None should fall back to env var + videoio_none = VideoMediaIO(imageio, num_frames=10, video_backend=None) + frames_none, metadata_none = videoio_none.load_bytes(b"test") + np.testing.assert_array_equal(frames_none, FAKE_OUTPUT_2) + assert metadata_none["video_backend"] == "test_video_backend_override_2" + + # Not providing video_backend should also fall back to env var + videoio_missing = VideoMediaIO(imageio, num_frames=10) + frames_missing, metadata_missing = videoio_missing.load_bytes(b"test") + np.testing.assert_array_equal(frames_missing, FAKE_OUTPUT_2) + assert metadata_missing["video_backend"] == "test_video_backend_override_2" diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py index 7bcac9ad768e7..62203186510ce 100644 --- a/tests/quantization/test_fp8.py +++ b/tests/quantization/test_fp8.py @@ -10,10 +10,14 @@ import torch from tests.quantization.utils import is_quant_method_supported from vllm import _custom_ops as ops +from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.quantization.fp8 import ( + Fp8Config, Fp8KVCacheMethod, Fp8LinearMethod, + Fp8MoEMethod, ) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.platforms import current_platform MODELS = [ @@ -261,3 +265,87 @@ def test_scaled_fp8_quant(dtype) -> None: torch.narrow(y_nc_pad, 0, 0, x_nc.shape[0]), inv_scale_nc, dtype ), ) + + +@pytest.mark.parametrize("method_cls", [Fp8LinearMethod, Fp8MoEMethod]) +# FP8 weight reloading does not support online quantization +@pytest.mark.parametrize("is_checkpoint_fp8_serialized", [True]) # skip False +@pytest.mark.parametrize("weight_block_size", [None, [1, 1]]) +# any postprocessing that is applied to the weights such as padding and repacking +# (excluding device sharding) must also be applied to the reloaded weights +# +# this is the case for marlin as well as per-tensor Fp8MoEMethod +@pytest.mark.parametrize("use_marlin", [False]) # skip True +def test_fp8_reloading( + method_cls, is_checkpoint_fp8_serialized, weight_block_size, use_marlin, dist_init +): + if is_checkpoint_fp8_serialized is False: + pytest.skip("FP8 weight reloading does not support online quantization") + + if method_cls is Fp8MoEMethod and weight_block_size is None: + pytest.skip( + "FP8 Tensor weight reloading does not support fusing w13_weight_scale. 
" + "If this is your use case, consider using a restore function like #26327" + ) + + with torch.device("cuda:0"): + config = Fp8Config( + is_checkpoint_fp8_serialized=is_checkpoint_fp8_serialized, + weight_block_size=weight_block_size, + ) + + if method_cls is Fp8LinearMethod: + layer = torch.nn.Linear(1, 1) + method = method_cls(config) + method.create_weights( + layer=layer, + input_size_per_partition=1, + output_partition_sizes=[1], + input_size=1, + output_size=1, + params_dtype=torch.bfloat16, + weight_loader=default_weight_loader, + ) + + else: + layer = FusedMoE( + num_experts=1, + top_k=1, + hidden_size=1, + intermediate_size=1, + ) + method = method_cls(config, layer) + method.create_weights( + layer=layer, + num_experts=1, + hidden_size=1, + intermediate_size_per_partition=1, + params_dtype=torch.bfloat16, + weight_loader=default_weight_loader, + ) + + method.use_marlin = use_marlin + + # capture weights format during loading + original_metadata = [ + (name, param.shape, getattr(param, "weight_loader", default_weight_loader)) + for name, param in layer.named_parameters() + ] + + # test loading + for name, shape, _ in original_metadata: + param = getattr(layer, name) + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, torch.zeros(shape)) # cannot use empty + + method.process_weights_after_loading(layer) + + # test reloading works after loading + # assuming that no reshaping occurred + for name, shape, original_weight_loader in original_metadata: + param = getattr(layer, name) + weight_loader = getattr(param, "weight_loader", default_weight_loader) + assert weight_loader is original_weight_loader + weight_loader(param, torch.zeros(shape)) # cannot use empty + + method.process_weights_after_loading(layer) diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index 334f9a65e4c03..0ff6e8407ce67 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -212,11 +212,11 @@ def test_ocp_mx_wikitext_correctness(config: AccuracyTestConfig, tp_size: int): task = "wikitext" rtol = 0.1 - # Smaller cuda_graph_sizes to speed up the test. + # Smaller cudagraph_capture_sizes to speed up the test. 
results = lm_eval.simple_evaluate( model="vllm", model_args=config.get_model_args( - tp_size=tp_size, kwargs={"cuda_graph_sizes": [16]} + tp_size=tp_size, kwargs={"cudagraph_capture_sizes": [16]} ), tasks=task, batch_size=64, diff --git a/tests/reasoning/test_base_thinking_reasoning_parser.py b/tests/reasoning/test_base_thinking_reasoning_parser.py index 34e9483de54b3..165e91a2c79f2 100644 --- a/tests/reasoning/test_base_thinking_reasoning_parser.py +++ b/tests/reasoning/test_base_thinking_reasoning_parser.py @@ -132,6 +132,41 @@ class TestBaseThinkingReasoningParserMethods: is False ) + def test_is_reasoning_end_streaming(self, test_tokenizer): + """Test the is_reasoning_end_streaming method.""" + parser = TestThinkingReasoningParser(test_tokenizer) + end_token_id = parser.end_token_id + start_token_id = parser.start_token_id + + assert ( + parser.is_reasoning_end_streaming([1, 2, end_token_id], [end_token_id]) + is True + ) + assert parser.is_reasoning_end_streaming([1, 2, 3, 4], [4]) is False + assert parser.is_reasoning_end_streaming([], []) is False + assert ( + parser.is_reasoning_end_streaming( + [1, start_token_id, 2, end_token_id], [end_token_id] + ) + is True + ) + assert ( + parser.is_reasoning_end_streaming([1, start_token_id, 2, 3], [3]) is False + ) + assert ( + parser.is_reasoning_end_streaming( + [1, start_token_id, 2, end_token_id, 2, start_token_id, 2], + [2], + ) + is False + ) + assert ( + parser.is_reasoning_end_streaming( + [1, start_token_id, 2, end_token_id, 2, 2], [2] + ) + is False + ) + def test_extract_content_ids(self, test_tokenizer): """Test the extract_content_ids method.""" parser = TestThinkingReasoningParser(test_tokenizer) diff --git a/tests/reasoning/test_deepseekv3_reasoning_parser.py b/tests/reasoning/test_deepseekv3_reasoning_parser.py index 6e8f0e8dcc9b9..874fdef778110 100644 --- a/tests/reasoning/test_deepseekv3_reasoning_parser.py +++ b/tests/reasoning/test_deepseekv3_reasoning_parser.py @@ -40,6 +40,7 @@ def test_identity_reasoning_parser_basic(tokenizer): input_tokens = tokenizer.tokenize(input_text) input_ids = tokenizer.convert_tokens_to_ids(input_tokens) assert parser.is_reasoning_end(input_ids) is True + assert parser.is_reasoning_end_streaming(input_ids, input_ids) is True # Test extract_content_ids returns all input_ids assert parser.extract_content_ids(input_ids) == input_ids diff --git a/tests/reasoning/test_minimax_m2_append_reasoning_parser.py b/tests/reasoning/test_minimax_m2_append_reasoning_parser.py new file mode 100644 index 0000000000000..eefe5e3eff74c --- /dev/null +++ b/tests/reasoning/test_minimax_m2_append_reasoning_parser.py @@ -0,0 +1,195 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +from transformers import AutoTokenizer + +from tests.reasoning.utils import run_reasoning_extraction +from vllm.reasoning import ReasoningParser, ReasoningParserManager + +parser_name = "minimax_m2_append_think" +end_token = "" + +# MiniMax M2 model path +REASONING_MODEL_NAME = "MiniMaxAI/MiniMax-M2" + + +@pytest.fixture(scope="module") +def minimax_m2_tokenizer(): + return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) + + +# ============================================================================= +# MiniMaxM2AppendThinkReasoningParser behavior: +# - Prepends to the beginning of the output +# - Does NOT separate reasoning and content +# - Returns everything as content (with prepended) +# - reasoning is always None +# +# This parser is used when you 
want to keep the raw output with added +# ============================================================================= + +# Case: simple output with end token +SIMPLE_OUTPUT = { + "output": "This is reasoningThis is response", + "reasoning": None, + "content": "This is reasoningThis is response", + "is_reasoning_end": True, +} + +# Case: output without end token (reasoning in progress) +NO_END_TOKEN = { + "output": "This is reasoning in progress", + "reasoning": None, + "content": "This is reasoning in progress", + "is_reasoning_end": False, +} + +# Case: only end token +ONLY_END_TOKEN = { + "output": "This is response", + "reasoning": None, + "content": "This is response", + "is_reasoning_end": True, +} + +# Case: multiple lines +MULTIPLE_LINES = { + "output": "Line 1\nLine 2Response 1\nResponse 2", + "reasoning": None, + "content": "Line 1\nLine 2Response 1\nResponse 2", + "is_reasoning_end": True, +} + +# Case: empty output (non-streaming prepends ) +EMPTY = { + "output": "", + "reasoning": None, + "content": "", + "is_reasoning_end": False, +} + +# Case: empty output streaming (no tokens = no output) +EMPTY_STREAMING = { + "output": "", + "reasoning": None, + "content": None, + "is_reasoning_end": False, +} + +# Case: special characters +SPECIAL_CHARS = { + "output": "Let me think... 1+1=2Yes!", + "reasoning": None, + "content": "Let me think... 1+1=2Yes!", + "is_reasoning_end": True, +} + +# Case: code in output +CODE_OUTPUT = { + "output": "```python\nprint('hi')\n```Here's the code.", + "reasoning": None, + "content": "```python\nprint('hi')\n```Here's the code.", + "is_reasoning_end": True, +} + +TEST_CASES = [ + pytest.param( + False, + SIMPLE_OUTPUT, + id="simple_output", + ), + pytest.param( + True, + SIMPLE_OUTPUT, + id="simple_output_streaming", + ), + pytest.param( + False, + NO_END_TOKEN, + id="no_end_token", + ), + pytest.param( + True, + NO_END_TOKEN, + id="no_end_token_streaming", + ), + pytest.param( + False, + ONLY_END_TOKEN, + id="only_end_token", + ), + pytest.param( + True, + ONLY_END_TOKEN, + id="only_end_token_streaming", + ), + pytest.param( + False, + MULTIPLE_LINES, + id="multiple_lines", + ), + pytest.param( + True, + MULTIPLE_LINES, + id="multiple_lines_streaming", + ), + pytest.param( + False, + EMPTY, + id="empty", + ), + pytest.param( + True, + EMPTY_STREAMING, + id="empty_streaming", + ), + pytest.param( + False, + SPECIAL_CHARS, + id="special_chars", + ), + pytest.param( + True, + SPECIAL_CHARS, + id="special_chars_streaming", + ), + pytest.param( + False, + CODE_OUTPUT, + id="code_output", + ), + pytest.param( + True, + CODE_OUTPUT, + id="code_output_streaming", + ), +] + + +@pytest.mark.parametrize("streaming, param_dict", TEST_CASES) +def test_reasoning( + streaming: bool, + param_dict: dict, + minimax_m2_tokenizer, +): + output = minimax_m2_tokenizer.tokenize(param_dict["output"]) + # decode everything to tokens + output_tokens: list[str] = [ + minimax_m2_tokenizer.convert_tokens_to_string([token]) for token in output + ] + parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)( + minimax_m2_tokenizer + ) + + reasoning, content = run_reasoning_extraction( + parser, output_tokens, streaming=streaming + ) + + assert reasoning == param_dict["reasoning"] + assert content == param_dict["content"] + + # Test is_reasoning_end + output_ids = minimax_m2_tokenizer.convert_tokens_to_ids(output) + is_reasoning_end = parser.is_reasoning_end(output_ids) + assert is_reasoning_end == param_dict["is_reasoning_end"] diff --git 
a/tests/reasoning/test_minimax_m2_reasoning_parser.py b/tests/reasoning/test_minimax_m2_reasoning_parser.py new file mode 100644 index 0000000000000..0d1056894c6ae --- /dev/null +++ b/tests/reasoning/test_minimax_m2_reasoning_parser.py @@ -0,0 +1,230 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import pytest +from transformers import AutoTokenizer + +from tests.reasoning.utils import run_reasoning_extraction +from vllm.reasoning import ReasoningParser, ReasoningParserManager + +parser_name = "minimax_m2" +end_token = "" + +# MiniMax M2 model path +REASONING_MODEL_NAME = "MiniMaxAI/MiniMax-M2" + + +@pytest.fixture(scope="module") +def minimax_m2_tokenizer(): + return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME) + + +# ============================================================================= +# MiniMax M2 specific behavior: +# - Model does NOT generate start token +# - Model only generates end token +# - All content before is reasoning +# - All content after is the actual response (content) +# ============================================================================= + +# Case: reasoning + end token + content (typical case) +SIMPLE_REASONING = { + "output": "This is a reasoning sectionThis is the rest", + "reasoning": "This is a reasoning section", + "content": "This is the rest", + "is_reasoning_end": True, +} + +# Case: reasoning + end token only (no content after) +COMPLETE_REASONING = { + "output": "This is a reasoning section", + "reasoning": "This is a reasoning section", + "content": None, + "is_reasoning_end": True, +} + +# Case: no end token yet (streaming in progress, all is reasoning) +NO_END_TOKEN = { + "output": "This is reasoning in progress", + "reasoning": "This is reasoning in progress", + "content": None, + "is_reasoning_end": False, +} + +# Case: multiple lines of reasoning +MULTIPLE_LINES = { + "output": "First line\nSecond lineResponse first line\nResponse second", + "reasoning": "First line\nSecond line", + "content": "Response first line\nResponse second", + "is_reasoning_end": True, +} + +# Case: only end token (empty reasoning, immediate response) +SHORTEST_REASONING_NO_STREAMING = { + "output": "This is the response", + "reasoning": "", + "content": "This is the response", + "is_reasoning_end": True, +} + +# Case: only end token streaming (reasoning is None because it's just the token) +SHORTEST_REASONING_STREAMING = { + "output": "This is the response", + "reasoning": None, + "content": "This is the response", + "is_reasoning_end": True, +} + +# Case: empty output +EMPTY = { + "output": "", + "reasoning": "", + "content": None, + "is_reasoning_end": False, +} + +# Case: empty streaming +EMPTY_STREAMING = { + "output": "", + "reasoning": None, + "content": None, + "is_reasoning_end": False, +} + +# Case: long reasoning with special characters +SPECIAL_CHARS = { + "output": "Let me think... 1+1=2, right?Yes, 1+1=2.", + "reasoning": "Let me think... 
1+1=2, right?", + "content": "Yes, 1+1=2.", + "is_reasoning_end": True, +} + +# Case: reasoning with code blocks +CODE_IN_REASONING = { + "output": "```python\nprint('hello')\n```Here is the code.", + "reasoning": "```python\nprint('hello')\n```", + "content": "Here is the code.", + "is_reasoning_end": True, +} + +TEST_CASES = [ + # Core cases: no start token (MiniMax M2 actual behavior) + pytest.param( + False, + SIMPLE_REASONING, + id="simple_reasoning", + ), + pytest.param( + True, + SIMPLE_REASONING, + id="simple_reasoning_streaming", + ), + pytest.param( + False, + COMPLETE_REASONING, + id="complete_reasoning", + ), + pytest.param( + True, + COMPLETE_REASONING, + id="complete_reasoning_streaming", + ), + pytest.param( + False, + NO_END_TOKEN, + id="no_end_token", + ), + pytest.param( + True, + NO_END_TOKEN, + id="no_end_token_streaming", + ), + pytest.param( + False, + MULTIPLE_LINES, + id="multiple_lines", + ), + pytest.param( + True, + MULTIPLE_LINES, + id="multiple_lines_streaming", + ), + pytest.param( + False, + SHORTEST_REASONING_NO_STREAMING, + id="shortest_reasoning", + ), + pytest.param( + True, + SHORTEST_REASONING_STREAMING, + id="shortest_reasoning_streaming", + ), + pytest.param( + False, + EMPTY, + id="empty", + ), + pytest.param( + True, + EMPTY_STREAMING, + id="empty_streaming", + ), + pytest.param( + False, + SPECIAL_CHARS, + id="special_chars", + ), + pytest.param( + True, + SPECIAL_CHARS, + id="special_chars_streaming", + ), + pytest.param( + False, + CODE_IN_REASONING, + id="code_in_reasoning", + ), + pytest.param( + True, + CODE_IN_REASONING, + id="code_in_reasoning_streaming", + ), +] + + +@pytest.mark.parametrize("streaming, param_dict", TEST_CASES) +def test_reasoning( + streaming: bool, + param_dict: dict, + minimax_m2_tokenizer, +): + output = minimax_m2_tokenizer.tokenize(param_dict["output"]) + # decode everything to tokens + output_tokens: list[str] = [ + minimax_m2_tokenizer.convert_tokens_to_string([token]) for token in output + ] + parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(parser_name)( + minimax_m2_tokenizer + ) + + reasoning, content = run_reasoning_extraction( + parser, output_tokens, streaming=streaming + ) + + assert reasoning == param_dict["reasoning"] + assert content == param_dict["content"] + + # Test is_reasoning_end + output_ids = minimax_m2_tokenizer.convert_tokens_to_ids(output) + is_reasoning_end = parser.is_reasoning_end(output_ids) + assert is_reasoning_end == param_dict["is_reasoning_end"] + + # Test extract_content + if param_dict["content"] is not None: + content = parser.extract_content_ids(output_ids) + assert content == minimax_m2_tokenizer.convert_tokens_to_ids( + minimax_m2_tokenizer.tokenize(param_dict["content"]) + ) + else: + content = parser.extract_content_ids(output) + assert content == [] diff --git a/tests/reasoning/test_mistral_reasoning_parser.py b/tests/reasoning/test_mistral_reasoning_parser.py index 0fe315c2567f9..01592fd0782a9 100644 --- a/tests/reasoning/test_mistral_reasoning_parser.py +++ b/tests/reasoning/test_mistral_reasoning_parser.py @@ -18,47 +18,53 @@ def mistral_tokenizer(): return mistral_tokenizer -SIMPLE_REASONING = { +INVALID_SIMPLE_REASONING = { "output": "This is a reasoning section[/THINK]This is the rest", - "reasoning": "This is a reasoning section", - "content": "This is the rest", - "is_reasoning_end": True, + "reasoning": None, + "content": "This is a reasoning sectionThis is the rest", + "is_reasoning_end": False, } -COMPLETE_REASONING = { 
+INVALID_COMPLETE_REASONING = { "output": "This is a reasoning section[/THINK]", - "reasoning": "This is a reasoning section", - "content": None, - "is_reasoning_end": True, + "reasoning": None, + "content": "This is a reasoning section", + "is_reasoning_end": False, } NO_CONTENT = { - "output": "This is content", - "reasoning": "This is content", + "output": "[THINK]This is reasoning", + "reasoning": "This is reasoning", "content": None, "is_reasoning_end": False, } +NO_REASONING = { + "output": "This is content", + "reasoning": None, + "content": "This is content", + "is_reasoning_end": False, +} NO_REASONING_STREAMING = { "output": "This is a reasoning section", - "reasoning": "This is a reasoning section", - "content": None, + "reasoning": None, + "content": "This is a reasoning section", "is_reasoning_end": False, } -MULTIPLE_LINES = { +INVALID_MULTIPLE_LINES = { "output": "This\nThat[/THINK]This is the rest\nThat", - "reasoning": "This\nThat", - "content": "This is the rest\nThat", - "is_reasoning_end": True, + "reasoning": None, + "content": "This\nThatThis is the rest\nThat", + "is_reasoning_end": False, } -SHORTEST_REASONING_NO_STREAMING = { - "output": "[/THINK]This is the rest", - "reasoning": "", - "content": "This is the rest", - "is_reasoning_end": True, -} -SHORTEST_REASONING = { +INVALID_SHORTEST_REASONING_NO_STREAMING = { "output": "[/THINK]This is the rest", "reasoning": None, "content": "This is the rest", - "is_reasoning_end": True, + "is_reasoning_end": False, +} +INVALID_SHORTEST_REASONING = { + "output": "[/THINK]This is the rest", + "reasoning": None, + "content": "This is the rest", + "is_reasoning_end": False, } REASONING_WITH_THINK = { "output": "[THINK]This is a reasoning section[/THINK]This is the rest", @@ -78,17 +84,17 @@ MULTIPLE_LINES_WITH_THINK = { "content": "This is the rest\nThat", "is_reasoning_end": True, } -SHORTEST_REASONING_NO_STREAMING_WITH_THINK = { - "output": "[/THINK]This is the rest", - "reasoning": "", - "content": "This is the rest", - "is_reasoning_end": True, -} -SHORTEST_REASONING_WITH_THINK = { +INVALID_SHORTEST_REASONING_NO_STREAMING_WITH_THINK = { "output": "[/THINK]This is the rest", "reasoning": None, "content": "This is the rest", - "is_reasoning_end": True, + "is_reasoning_end": False, +} +INVALID_SHORTEST_REASONING_WITH_THINK = { + "output": "[/THINK]This is the rest", + "reasoning": None, + "content": "This is the rest", + "is_reasoning_end": False, } THINK_NO_END = { "output": "[THINK]This is a reasoning section", @@ -98,8 +104,8 @@ THINK_NO_END = { } EMPTY = { "output": "", - "reasoning": "", - "content": None, + "reasoning": None, + "content": "", "is_reasoning_end": False, } EMPTY_STREAMING = { @@ -109,47 +115,48 @@ EMPTY_STREAMING = { "is_reasoning_end": False, } NEW_LINE = { - "output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest", + "output": "Before\n[THINK]This is a reasoning section[/THINK]\nThis is the rest", "reasoning": "This is a reasoning section", - "content": "\nThis is the rest", + "content": "Before\n\nThis is the rest", "is_reasoning_end": True, } -# Streaming cannot handle new lines at the beginning of the output -# because we need to support [THINK]...[/THINK] and [/THINK]... -# We cannot know if the text before [THINK] is reasoning content -# or not. 
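+# Streaming and non-streaming now agree: text emitted before [THINK] is treated
+# as content, so the leading "Before\n" lands in content in both modes.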
NEW_LINE_STREAMING = { - "output": "\n[THINK]This is a reasoning section[/THINK]\nThis is the rest", - "reasoning": "\nThis is a reasoning section", - "content": "\nThis is the rest", + "output": "Before\n[THINK]This is a reasoning section[/THINK]\nThis is the rest", + "reasoning": "This is a reasoning section", + "content": "Before\n\nThis is the rest", "is_reasoning_end": True, } TEST_CASES = [ pytest.param( False, - SIMPLE_REASONING, - id="simple_reasoning", + INVALID_SIMPLE_REASONING, + id="invalid_simple_reasoning", ), pytest.param( True, - SIMPLE_REASONING, - id="simple_reasoning_streaming", + INVALID_SIMPLE_REASONING, + id="invalid_simple_reasoning_streaming", ), pytest.param( False, - COMPLETE_REASONING, - id="complete_reasoning", + INVALID_COMPLETE_REASONING, + id="invalid_complete_reasoning", ), pytest.param( True, - COMPLETE_REASONING, - id="complete_reasoning_streaming", + INVALID_COMPLETE_REASONING, + id="invalid_complete_reasoning_streaming", ), pytest.param( False, NO_CONTENT, - id="no_content_token", + id="no_content", + ), + pytest.param( + False, + NO_REASONING, + id="no_reasoning", ), pytest.param( True, @@ -158,23 +165,23 @@ TEST_CASES = [ ), pytest.param( False, - MULTIPLE_LINES, - id="multiple_lines", + INVALID_MULTIPLE_LINES, + id="invalid_multiple_lines", ), pytest.param( True, - MULTIPLE_LINES, - id="multiple_lines_streaming", + INVALID_MULTIPLE_LINES, + id="invalid_multiple_lines_streaming", ), pytest.param( True, - SHORTEST_REASONING, - id="shortest", + INVALID_SHORTEST_REASONING, + id="invalid_shortest", ), pytest.param( False, - SHORTEST_REASONING_NO_STREAMING, - id="shortest_streaming", + INVALID_SHORTEST_REASONING_NO_STREAMING, + id="invalid_shortest_streaming", ), pytest.param( False, @@ -208,13 +215,13 @@ TEST_CASES = [ ), pytest.param( False, - SHORTEST_REASONING_NO_STREAMING_WITH_THINK, - id="shortest_with_think", + INVALID_SHORTEST_REASONING_NO_STREAMING_WITH_THINK, + id="invalid_shortest_with_think", ), pytest.param( True, - SHORTEST_REASONING_WITH_THINK, - id="shortest_with_think_streaming", + INVALID_SHORTEST_REASONING_WITH_THINK, + id="invalid_shortest_with_think_streaming", ), pytest.param( False, @@ -316,10 +323,26 @@ def test_mistral_reasoning( # Test extract_content if param_dict["content"] is not None: - content = parser.extract_content_ids(output_tokens) - assert content == mistral_tokenizer.tokenizer.encode( - param_dict["content"], bos=False, eos=False + # Handle the case where there are tokens outputted before Thinking. + # This should not occur if the model is well trained and prompted. + if "[THINK]" in param_dict["output"] and not param_dict["output"].startswith( + "[THINK]" + ): + before_content = param_dict["output"].split("[THINK]")[0] + before_token_ids = mistral_tokenizer.tokenizer.encode( + before_content, bos=False, eos=False + ) + left_to_encode = param_dict["content"][len(before_content) :] + # Normal situation. 
+ else: + before_token_ids = [] + left_to_encode = param_dict["content"] + + content_tokens = parser.extract_content_ids(output_tokens) + expected_token_ids = before_token_ids + mistral_tokenizer.tokenizer.encode( + left_to_encode, bos=False, eos=False ) + assert content_tokens == expected_token_ids else: content = parser.extract_content_ids(output_tokens) assert content == [] diff --git a/tests/standalone_tests/python_only_compile.sh b/tests/standalone_tests/python_only_compile.sh index d29b9afcc6fbf..2017e34030d60 100644 --- a/tests/standalone_tests/python_only_compile.sh +++ b/tests/standalone_tests/python_only_compile.sh @@ -3,12 +3,45 @@ # for users who do not have any compilers installed on their system set -e -set -x merge_base_commit=$(git merge-base HEAD origin/main) -echo "Current merge base commit with main: $merge_base_commit" +echo "INFO: current merge base commit with main: $merge_base_commit" git show --oneline -s $merge_base_commit +# test whether the metadata.json url is valid, retry each 3 minutes up to 5 times +# this avoids cumbersome error messages & manual retries in case the precompiled wheel +# for the given commit is still being built in the release pipeline +meta_json_url="https://wheels.vllm.ai/$merge_base_commit/vllm/metadata.json" +echo "INFO: will use metadata.json from $meta_json_url" + +for i in {1..5}; do + echo "Checking metadata.json URL (attempt $i)..." + if curl --fail "$meta_json_url" > metadata.json; then + echo "INFO: metadata.json URL is valid." + # check whether it is valid json by python + if python3 -m json.tool metadata.json; then + echo "INFO: metadata.json is valid JSON. Proceeding with the test." + else + echo "CRITICAL: metadata.json exists but is not valid JSON, please do report in #sig-ci channel!" + exit 1 + fi + break + fi + # failure handling + if [ $i -eq 5 ]; then + echo "ERROR: metadata.json URL is still not valid after 5 attempts." + echo "ERROR: Please check whether the precompiled wheel for commit $merge_base_commit exists." + echo " NOTE: If $merge_base_commit is a new commit on main, maybe try again after its release pipeline finishes." + echo " NOTE: If it fails, please report in #sig-ci channel." + exit 1 + else + echo "WARNING: metadata.json URL is not valid. Retrying in 3 minutes..." + sleep 180 + fi +done + +set -x + cd /vllm-workspace/ # uninstall vllm @@ -29,6 +62,6 @@ python3 -c 'import vllm' # Check if the clangd log file was created if [ ! 
-f /tmp/changed.file ]; then - echo "changed.file was not created, python only compilation failed" + echo "ERROR: changed.file was not created, python only compilation failed" exit 1 fi diff --git a/tests/test_config.py b/tests/test_config.py index 77d3a7115978e..ee706ab3d9c87 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -89,64 +89,6 @@ def test_update_config(): new_config3 = update_config(config3, {"a": "new_value"}) -# Can remove once --task option is fully deprecated -@pytest.mark.parametrize( - ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"), - [ - ("distilbert/distilgpt2", "generate", "none", "generate"), - ("intfloat/multilingual-e5-small", "pooling", "none", "embed"), - ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"), - ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "none", "classify"), - ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "none", "embed"), - ("openai/whisper-small", "generate", "none", "transcription"), - ], -) -def test_auto_task( - model_id, expected_runner_type, expected_convert_type, expected_task -): - config = ModelConfig(model_id, task="auto") - - assert config.runner_type == expected_runner_type - assert config.convert_type == expected_convert_type - - -# Can remove once --task option is fully deprecated -@pytest.mark.parametrize( - ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"), - [ - ("distilbert/distilgpt2", "pooling", "embed", "embed"), - ("intfloat/multilingual-e5-small", "pooling", "embed", "embed"), - ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify", "classify"), - ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify", "classify"), - ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed", "embed"), - ("openai/whisper-small", "pooling", "embed", "embed"), - ], -) -def test_score_task( - model_id, expected_runner_type, expected_convert_type, expected_task -): - config = ModelConfig(model_id, task="score") - - assert config.runner_type == expected_runner_type - assert config.convert_type == expected_convert_type - - -# Can remove once --task option is fully deprecated -@pytest.mark.parametrize( - ("model_id", "expected_runner_type", "expected_convert_type", "expected_task"), - [ - ("openai/whisper-small", "generate", "none", "transcription"), - ], -) -def test_transcription_task( - model_id, expected_runner_type, expected_convert_type, expected_task -): - config = ModelConfig(model_id, task="transcription") - - assert config.runner_type == expected_runner_type - assert config.convert_type == expected_convert_type - - @pytest.mark.parametrize( ("model_id", "expected_runner_type", "expected_convert_type"), [ @@ -1085,7 +1027,7 @@ def test_vllm_config_explicit_overrides(): ) # Override one field but not others - pass_config = PassConfig(enable_noop=False) + pass_config = PassConfig(eliminate_noops=False) compilation_config = CompilationConfig(pass_config=pass_config) config = VllmConfig( model_config=regular_model, diff --git a/tests/test_envs.py b/tests/test_envs.py index 11bbec38202bf..b6b7cf38d4abc 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -8,6 +8,7 @@ import pytest import vllm.envs as envs from vllm.envs import ( + disable_envs_cache, enable_envs_cache, env_list_with_choices, env_set_with_choices, @@ -57,6 +58,43 @@ def test_getattr_with_cache(monkeypatch: pytest.MonkeyPatch): envs.__getattr__ = envs.__getattr__.__wrapped__ +def test_getattr_with_reset(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("VLLM_HOST_IP", 
"1.1.1.1") + # __getattr__ is not decorated with functools.cache + assert not hasattr(envs.__getattr__, "cache_info") + + # Enable envs cache and ignore ongoing environment changes + enable_envs_cache() + assert envs.VLLM_HOST_IP == "1.1.1.1" + # With cache enabled, the environment variable value is cached and unchanged + monkeypatch.setenv("VLLM_HOST_IP", "2.2.2.2") + assert envs.VLLM_HOST_IP == "1.1.1.1" + + disable_envs_cache() + assert envs.VLLM_HOST_IP == "2.2.2.2" + # After cache disabled, the environment variable value would be synced + # with os.environ + monkeypatch.setenv("VLLM_HOST_IP", "3.3.3.3") + assert envs.VLLM_HOST_IP == "3.3.3.3" + + +def test_is_envs_cache_enabled() -> None: + assert not envs._is_envs_cache_enabled() + enable_envs_cache() + assert envs._is_envs_cache_enabled() + + # Only wrap one-layer of cache, so we only need to + # call disable once to reset. + enable_envs_cache() + enable_envs_cache() + enable_envs_cache() + disable_envs_cache() + assert not envs._is_envs_cache_enabled() + + disable_envs_cache() + assert not envs._is_envs_cache_enabled() + + class TestEnvWithChoices: """Test cases for env_with_choices function.""" diff --git a/tests/tool_use/test_mistral_tool_parser.py b/tests/tool_use/test_mistral_tool_parser.py index e5deb7f40eb35..2dd0399cb8eeb 100644 --- a/tests/tool_use/test_mistral_tool_parser.py +++ b/tests/tool_use/test_mistral_tool_parser.py @@ -615,6 +615,7 @@ def test_extract_tool_calls_streaming( "single_tool_weather", "multiple_tool_calls", "content_before_tool", + "complex", ], argnames=["model_output", "expected_tool_calls", "expected_content"], argvalues=[ @@ -673,6 +674,21 @@ def test_extract_tool_calls_streaming( ], "bla", ), + ( + # Complex + """[TOOL_CALLS]bash{"command": "print(\\"hello world!\\")\\nre.compile(r\'{}\')"}""", # noqa: E501 + [ + ToolCall( + function=FunctionCall( + name="bash", + arguments=json.dumps( + {"command": "print(\"hello world!\")\nre.compile(r'{}')"} + ), + ) + ) + ], + "", + ), ], ) def test_extract_tool_calls_streaming_one_chunk( diff --git a/tests/utils.py b/tests/utils.py index ea3675b1461b8..d8102331b3612 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -119,7 +119,7 @@ class RemoteOpenAIServer: vllm_serve_args: list[str], *, env_dict: dict[str, str] | None = None, - seed: int | None = 0, + seed: int = 0, auto_port: bool = True, max_wait_seconds: float | None = None, override_hf_configs: dict[str, Any] | None = None, @@ -283,7 +283,7 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer): child_process_fxn: Callable[[dict[str, str] | None, str, list[str]], None], *, env_dict: dict[str, str] | None = None, - seed: int | None = 0, + seed: int = 0, auto_port: bool = True, max_wait_seconds: float | None = None, ) -> None: diff --git a/tests/v1/attention/test_attention_splitting.py b/tests/v1/attention/test_attention_splitting.py index f60861e3489d6..f08e2f480e30f 100644 --- a/tests/v1/attention/test_attention_splitting.py +++ b/tests/v1/attention/test_attention_splitting.py @@ -154,7 +154,10 @@ def test_split_attn_metadata_decode_batch(large_decode_metadata): def apply_split_decodes_and_prefills( - query_lens: list[int], decode_threshold: int, require_uniform: bool + query_lens: list[int], + decode_threshold: int, + require_uniform: bool, + padded_num_tokens: int | None = None, ): """Helper function to apply split_decodes_and_prefills and return the results.""" @@ -165,6 +168,10 @@ def apply_split_decodes_and_prefills( block_size=16, device=device, ) + + if padded_num_tokens is not None: + 
common_metadata.num_actual_tokens = padded_num_tokens + return split_decodes_and_prefills( common_metadata, decode_threshold=decode_threshold, @@ -271,6 +278,22 @@ def test_split_decodes_and_prefills_uniform_mixed_batch_non_uniform_decodes(): assert num_prefill_tokens == (sum(query_lens) - 2) # rest of the tokens +def test_split_decodes_and_prefills_uniform_padded_batch_all_same(): + """uniform batch where all query lengths are identical with 0 length padded reqs.""" + # All query lengths are 2, with decode_threshold=3 (so 2 <= 3) + # This triggers the padded uniform path at line 891 + query_lens = [2, 2, 2, 0] + padded_num_tokens = 8 + num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = ( + apply_split_decodes_and_prefills(query_lens, 3, True, padded_num_tokens) + ) + # With uniform batch, all requests are treated as decodes + assert num_decodes == 4 + assert num_prefills == 0 + assert num_decode_tokens == padded_num_tokens + assert num_prefill_tokens == 0 + + @pytest.mark.parametrize( "seq_lens,query_lens,split_point,expected_first_reqs,expected_second_reqs", [ diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index 6cab129c116c5..4dcaf9d908690 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -106,8 +106,8 @@ def create_common_attn_metadata( query_start_loc=query_start_loc, query_start_loc_cpu=query_start_loc_cpu, seq_lens=seq_lens, - seq_lens_cpu=seq_lens_cpu, - num_computed_tokens_cpu=num_computed_tokens_cpu, + _seq_lens_cpu=seq_lens_cpu, + _num_computed_tokens_cpu=num_computed_tokens_cpu, num_reqs=batch_spec.batch_size, num_actual_tokens=num_tokens, max_query_len=max_query_len, diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index c6c4a5085bff7..1999e9f6c3b99 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -1536,7 +1536,7 @@ def create_scheduler_with_priority( ) kv_transfer_config = ( KVTransferConfig( - kv_connector="SharedStorageConnector", + kv_connector="ExampleConnector", kv_role="kv_both", kv_connector_extra_config={"shared_storage_path": "local_storage"}, ) @@ -1552,7 +1552,7 @@ def create_scheduler_with_priority( ec_transfer_config = ( ECTransferConfig( - ec_connector="ECSharedStorageConnector", + ec_connector="ECExampleConnector", ec_role=ec_role, ec_connector_extra_config={"shared_storage_path": "/tmp/ec_test"}, ) @@ -2413,7 +2413,7 @@ def _assert_right_ec_connector_metadata( metadata_dict = {mm_data.mm_hash: mm_data for mm_data in metadata.mm_datas} # Check all required identifiers exist in metadata; and no extra - # In ECSharedStorageConnector format + # In ECExampleConnector format # NOTE: even having same identifier, the mm_features can be different # since their mm_position can be in different offsets, etc identifiers_dict = {f.identifier for f in mm_features_list} diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index f5ba613d38db1..531b9c595b04d 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -108,7 +108,7 @@ def create_scheduler( ) elif use_kv_connector: kv_transfer_config = KVTransferConfig( - kv_connector="SharedStorageConnector", + kv_connector="ExampleConnector", kv_role="kv_both", kv_connector_extra_config={"shared_storage_path": "local_storage"}, ) @@ -121,7 +121,7 @@ def create_scheduler( ec_transfer_config = ( ECTransferConfig( - ec_connector="ECSharedStorageConnector", + ec_connector="ECExampleConnector", ec_role=ec_role, ec_connector_extra_config={"shared_storage_path": "/tmp/ec_test"}, ) diff 
--git a/tests/v1/cudagraph/test_cudagraph_dispatch.py b/tests/v1/cudagraph/test_cudagraph_dispatch.py index b86534d3d4381..0e71d6c63ce68 100644 --- a/tests/v1/cudagraph/test_cudagraph_dispatch.py +++ b/tests/v1/cudagraph/test_cudagraph_dispatch.py @@ -161,10 +161,10 @@ class TestCudagraphDispatcher: assert rt_mode == CUDAGraphMode.NONE assert key == BatchDescriptor(num_tokens=15) - # 4. Cascade attention should have a fall back mode + # 4. disable_full should have a fall back mode (e.g., cascade attention) desc_full_exact = BatchDescriptor(num_tokens=8, uniform=False) rt_mode, key = dispatcher.dispatch( - num_tokens=8, uniform_decode=False, has_lora=False, use_cascade_attn=True + num_tokens=8, uniform_decode=False, has_lora=False, disable_full=True ) if "PIECEWISE" in cudagraph_mode_str: # string contains check assert rt_mode == CUDAGraphMode.PIECEWISE diff --git a/tests/v1/determinism/test_batch_invariance.py b/tests/v1/determinism/test_batch_invariance.py index 4311547baccf1..1c45e7fe366ff 100644 --- a/tests/v1/determinism/test_batch_invariance.py +++ b/tests/v1/determinism/test_batch_invariance.py @@ -10,6 +10,7 @@ from utils import ( BACKENDS, _extract_step_logprobs, _random_prompt, + is_device_capability_below_90, resolve_model_name, skip_unsupported, ) @@ -17,6 +18,8 @@ from utils import ( import vllm.model_executor.layers.batch_invariant as batch_invariant from vllm import LLM, SamplingParams +IS_DEVICE_CAPABILITY_BELOW_90 = is_device_capability_below_90() + @skip_unsupported @pytest.mark.timeout(1000) @@ -185,11 +188,12 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( llm = LLM( model=model_name, tensor_parallel_size=tp_size, - enable_prefix_caching=False, + # enable_prefix_caching=False, max_num_seqs=32, max_model_len=8192, dtype="bfloat16", # not everything is supported gpu_memory_utilization=0.9, + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, ) # Use more realistic prompts for better token generation @@ -394,6 +398,7 @@ def test_simple_generation(backend, monkeypatch: pytest.MonkeyPatch): max_model_len=2048, dtype="bfloat16", enable_prefix_caching=False, + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, ) prompt = "the capital of france is" @@ -457,10 +462,10 @@ def test_logprobs_without_batch_invariance_should_fail( llm = LLM( model=model_name, tensor_parallel_size=tp_size, - enable_prefix_caching=False, max_num_seqs=32, max_model_len=8192, dtype="bfloat16", + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, ) # build ragged prompts to change shapes significantly across BS=1 vs BS=N @@ -681,10 +686,10 @@ def test_decode_logprobs_match_prefill_logprobs( llm = LLM( model=model_name, tensor_parallel_size=tp_size, - enable_prefix_caching=False, max_num_seqs=32, max_model_len=8192, dtype="bfloat16", + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, ) # Use a few test prompts @@ -929,6 +934,7 @@ def LLM_with_max_seqs( dtype="bfloat16", tensor_parallel_size=int(os.getenv("VLLM_TP_SIZE", "1")), enable_prefix_caching=False, + enforce_eager=IS_DEVICE_CAPABILITY_BELOW_90, # Enable for MOE models # enable_expert_parallel=True, ) diff --git a/tests/v1/determinism/test_online_batch_invariance.py b/tests/v1/determinism/test_online_batch_invariance.py index d74b435797f8f..5e3b997364949 100644 --- a/tests/v1/determinism/test_online_batch_invariance.py +++ b/tests/v1/determinism/test_online_batch_invariance.py @@ -153,7 +153,10 @@ def test_logprobs_bitwise_batch_invariance_bs1_vs_bsN( } tp_size = os.getenv("VLLM_TP_SIZE", "1") - server_args: list[str] = [] + server_args: list[str] = [ + 
"--max-model-len=8192", + "--max-num-seqs=32", + ] if tp_size: server_args += ["-tp", tp_size] diff --git a/tests/v1/determinism/utils.py b/tests/v1/determinism/utils.py index 0d7da107728b4..a8013ed229cfc 100644 --- a/tests/v1/determinism/utils.py +++ b/tests/v1/determinism/utils.py @@ -11,12 +11,15 @@ from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer skip_unsupported = pytest.mark.skipif( - not (current_platform.is_cuda() and current_platform.has_device_capability(90)), - reason="Requires CUDA and >= Hopper (SM90)", + not (current_platform.is_cuda() and current_platform.has_device_capability(80)), + # Supports testing on Ampere and Ada Lovelace devices. + # Note: For devices with SM < 90, batch invariance does not support CUDA Graphs. + reason="Requires CUDA and >= Ampere (SM80)", ) BACKENDS: list[str] = [ "FLASH_ATTN", + "TRITON_MLA", ] if has_flashinfer(): @@ -96,3 +99,7 @@ def _extract_step_logprobs(request_output): return t, inner.token_ids return None, None + + +def is_device_capability_below_90() -> bool: + return not current_platform.has_device_capability(90) diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index 945276376d665..5cef9b33c9984 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -8,6 +8,7 @@ import torch._dynamo.config as dynamo_config from vllm import SamplingParams from vllm.logprobs import Logprob +from vllm.platforms import current_platform from vllm.sampling_params import StructuredOutputsParams from vllm.v1.metrics.reader import Metric @@ -70,6 +71,18 @@ def test_without_spec_decoding( (True, "uni", True, None, True), ] + if current_platform.is_rocm(): + # On ROCm, Only test with structured_outputs (deterministic) + # and skip chunk_prefill (more variable). 
+ test_configs = [ + cfg + for cfg in test_configs + if not cfg[4] # skip chunk_prefill=True + ] + test_sampling_params = [ + p for p in test_sampling_params if p.get("structured_outputs") is not None + ] + run_tests(monkeypatch, MODEL, test_configs, test_sampling_params) @@ -108,7 +121,14 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch): (True, "uni", True, spec_config_short, True), ] - run_tests(monkeypatch, MTP_MODEL, test_configs, test_sampling_params) + # On ROCm, use TRITON_ATTN + float32 for better numerical consistency + run_tests( + monkeypatch, + MTP_MODEL, + test_configs, + test_sampling_params, + is_testing_with_spec_decoding=True, + ) @dynamo_config.patch(cache_size_limit=16) @@ -117,13 +137,23 @@ def run_tests( model: str, test_configs: list[tuple], test_sampling_params: list[dict[str, Any]], + is_testing_with_spec_decoding: bool = False, ): """Test consistency of combos of async scheduling, preemption, uni/multiproc executor with spec decoding.""" with monkeypatch.context() as m: # avoid precision errors - m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") + if current_platform.is_rocm(): + if is_testing_with_spec_decoding: + # Use TRITON_ATTN for spec decoding test for consistency + m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN") + else: + m.setenv("VLLM_ATTENTION_BACKEND", "ROCM_AITER_FA") + else: + m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION") + # lock matmul precision to full FP32 (IEEE) + m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "ieee") # m.setenv("VLLM_BATCH_INVARIANT", "1") outputs: list[tuple[str, list, list]] = [] for n, ( @@ -143,6 +173,7 @@ def run_tests( async_scheduling, spec_config, test_prefill_chunking=test_prefill_chunking, + is_testing_with_spec_decoding=is_testing_with_spec_decoding, ) outputs.append(test_results) @@ -172,17 +203,34 @@ def run_tests( name_0=f"baseline=[{baseline_config}], params={params}", name_1=f"config=[{test_config}], params={params}", ) - assert _all_logprobs_match(base_logprobs, test_logprobs) + + # On ROCm with TRITON_ATTN (spec decoding test), skip strict + # logprobs comparison when logprobs are requested + skip_logprobs_check = ( + current_platform.is_rocm() + and params.get("logprobs") + and is_testing_with_spec_decoding + ) + if not skip_logprobs_check: + assert _all_logprobs_match(base_logprobs, test_logprobs) if ( base_acceptance_rate is not None and test_acceptance_rate is not None ): if "spec_mml=None" in test_config: + # Preemption causes more variance in acceptance rates + if ( + current_platform.is_rocm() + and "preemption=True" in test_config + ): + tolerance = 0.10 + else: + tolerance = 0.05 assert ( test_acceptance_rate > base_acceptance_rate or test_acceptance_rate - == pytest.approx(base_acceptance_rate, rel=5e-2) + == pytest.approx(base_acceptance_rate, rel=tolerance) ) else: # Currently the reported acceptance rate is expected to be @@ -213,6 +261,7 @@ def run_test( async_scheduling: bool, spec_config: dict[str, Any] | None, test_prefill_chunking: bool, + is_testing_with_spec_decoding: bool = False, ): spec_decoding = spec_config is not None cache_arg: dict[str, Any] = ( @@ -231,6 +280,15 @@ def run_test( print("-" * 80) print(f"---- TESTING {test_str}: {test_config}") print("-" * 80) + + # On ROCm: use float16 for first test (ROCM_AITER_FA), but float32 for + # spec decoding test (TRITON_ATTN) for better precision. + # On others: always use float32. 
+ if current_platform.is_rocm() and not is_testing_with_spec_decoding: + dtype = "float16" + else: + dtype = "float32" + with VllmRunner( model, max_model_len=512, @@ -240,7 +298,7 @@ def run_test( # enforce_eager=True, async_scheduling=async_scheduling, distributed_executor_backend=executor, - dtype="float32", # avoid precision errors + dtype=dtype, speculative_config=spec_config, disable_log_stats=False, **cache_arg, @@ -300,11 +358,21 @@ def _all_logprobs_match(req_a, req_b) -> bool: def _logprobs_match(lps_a: dict[int, Logprob], lps_b: dict[int, Logprob]) -> bool: - return len(lps_a) == len(lps_b) and all( - a.decoded_token == b.decoded_token - and a.rank == b.rank - and a.logprob == pytest.approx(b.logprob, rel=1e-3, abs=1e-6) - for a, b in ((lps_a[x], lps_b[x]) for x in lps_a) + if current_platform.is_rocm(): + # ROCm has higher numerical variance + # due to use of float16. + rel_tol, abs_tol = 5e-2, 1e-5 + else: + rel_tol, abs_tol = 1e-3, 1e-6 + return ( + len(lps_a) == len(lps_b) + and lps_a.keys() == lps_b.keys() + and all( + a.decoded_token == b.decoded_token + and a.rank == b.rank + and a.logprob == pytest.approx(b.logprob, rel=rel_tol, abs=abs_tol) + for a, b in ((lps_a[x], lps_b[x]) for x in lps_a) + ) ) diff --git a/tests/v1/e2e/test_async_spec_decode.py b/tests/v1/e2e/test_async_spec_decode.py new file mode 100644 index 0000000000000..561f37a52d573 --- /dev/null +++ b/tests/v1/e2e/test_async_spec_decode.py @@ -0,0 +1,131 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Test that verifies no implicit GPU-CPU synchronization occurs during +speculative decoding generation under expected conditions. +""" + +import multiprocessing +import sys +import traceback + +import pytest +import torch + + +@pytest.fixture +def sync_tracker(): + """ + Fixture that patches CommonAttentionMetadata.seq_lens_cpu to detect + lazy init syncs. Prints stack traces immediately when syncs occur. + """ + from vllm.v1.attention.backends.utils import CommonAttentionMetadata + + # Shared counter for cross-process communication (inherited by fork) + sync_count = multiprocessing.Value("i", 0) + + # Save original property + original_prop = CommonAttentionMetadata.seq_lens_cpu + original_fget = original_prop.fget + + # Create tracking wrapper + def tracking_seq_lens_cpu(self): + if self._seq_lens_cpu is None: + # Increment counter + with sync_count.get_lock(): + sync_count.value += 1 + count = sync_count.value + # Print stack trace immediately (shows in subprocess output) + print(f"\n{'=' * 60}", file=sys.stderr) + print(f"SYNC #{count}: seq_lens_cpu lazy init triggered!", file=sys.stderr) + print(f"{'=' * 60}", file=sys.stderr) + traceback.print_stack(file=sys.stderr) + print(f"{'=' * 60}\n", file=sys.stderr) + sys.stderr.flush() + return original_fget(self) + + # Apply patch + CommonAttentionMetadata.seq_lens_cpu = property(tracking_seq_lens_cpu) + + class SyncTracker: + @property + def count(self) -> int: + return sync_count.value + + def assert_no_sync(self, msg: str = ""): + count = sync_count.value + assert count == 0, ( + f"Unexpected GPU-CPU sync: seq_lens_cpu lazy init triggered " + f"{count} times. See stack traces above. 
{msg}" + ) + + yield SyncTracker() + + # Restore original property + CommonAttentionMetadata.seq_lens_cpu = original_prop + torch._dynamo.reset() + + +# Test configurations: (model, spec_model, method, num_spec_tokens, backend_env) +SPEC_DECODE_CONFIGS = [ + pytest.param( + "meta-llama/Llama-3.2-1B-Instruct", + "nm-testing/Llama3_2_1B_speculator.eagle3", + "eagle3", + 2, + id="eagle3-llama", + ), + pytest.param( + "eagle618/deepseek-v3-random", + "eagle618/eagle-deepseek-v3-random", + "eagle", + 2, + id="eagle-mla-deepseek", + ), +] + + +@pytest.mark.parametrize( + "model,spec_model,method,num_spec_tokens", + SPEC_DECODE_CONFIGS, +) +def test_no_sync_with_spec_decode( + sync_tracker, + model: str, + spec_model: str, + method: str, + num_spec_tokens: int, +): + """ + Test that no implicit GPU-CPU sync occurs during speculative decoding + generation. + """ + # Import vLLM AFTER sync_tracker fixture has applied the patch + from vllm import LLM, SamplingParams + from vllm.distributed import cleanup_dist_env_and_memory + + llm = LLM( + model=model, + max_model_len=256, + speculative_config={ + "method": method, + "num_speculative_tokens": num_spec_tokens, + "model": spec_model, + }, + enforce_eager=True, + async_scheduling=True, + ) + + outputs = llm.generate( + ["Hello, my name is"], + SamplingParams(temperature=0, max_tokens=10), + ) + + assert len(outputs) == 1 + assert len(outputs[0].outputs[0].text) > 0 + + del llm + torch.cuda.empty_cache() + cleanup_dist_env_and_memory() + + sync_tracker.assert_no_sync() diff --git a/tests/v1/e2e/test_spec_decode.py b/tests/v1/e2e/test_spec_decode.py index 416b582dfaa63..c8587659d6580 100644 --- a/tests/v1/e2e/test_spec_decode.py +++ b/tests/v1/e2e/test_spec_decode.py @@ -191,8 +191,8 @@ def test_suffix_decoding_acceptance( # Expect the acceptance rate to improve. assert first_accept_rate < last_accept_rate - # Heuristic: expect at least 82.5% acceptance rate at the end. - assert last_accept_rate > 0.825 + # Heuristic: expect at least 80.0% acceptance rate at the end. 
+ assert last_accept_rate > 0.80 del spec_llm torch.cuda.empty_cache() @@ -280,9 +280,20 @@ def test_speculators_model_integration( @pytest.mark.parametrize( - ["model_setup", "mm_enabled", "enable_chunked_prefill"], + ["model_setup", "mm_enabled", "enable_chunked_prefill", "model_impl"], [ - (("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), False, False), + ( + ("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), + False, + False, + "auto", + ), + ( + ("eagle3", "Qwen/Qwen3-8B", "AngelSlim/Qwen3-8B_eagle3", 1), + False, + False, + "transformers", + ), pytest.param( ( "eagle3", @@ -292,6 +303,7 @@ def test_speculators_model_integration( ), False, False, + "auto", marks=pytest.mark.skip( reason="architecture of its eagle3 is LlamaForCausalLMEagle3" ), @@ -305,6 +317,7 @@ def test_speculators_model_integration( ), False, False, + "auto", marks=pytest.mark.skip( reason="Skipping due to its head_dim not being a a multiple of 32" ), @@ -318,6 +331,7 @@ def test_speculators_model_integration( ), False, True, + "auto", marks=large_gpu_mark(min_gb=40), ), # works on 4x H100 ( @@ -329,6 +343,7 @@ def test_speculators_model_integration( ), False, False, + "auto", ), pytest.param( ( @@ -339,6 +354,7 @@ def test_speculators_model_integration( ), False, False, + "auto", marks=large_gpu_mark(min_gb=80), ), # works on 4x H100 pytest.param( @@ -350,6 +366,7 @@ def test_speculators_model_integration( ), True, True, + "auto", marks=large_gpu_mark(min_gb=80), ), # works on 4x H100 ( @@ -361,10 +378,12 @@ def test_speculators_model_integration( ), False, False, + "auto", ), ], ids=[ "qwen3_eagle3", + "qwen3_eagle3-transformers", "qwen3_vl_eagle3", "qwen2_5_vl_eagle3", "llama3_eagle", @@ -381,6 +400,7 @@ def test_eagle_correctness( model_setup: tuple[str, str, str, int], mm_enabled: bool, enable_chunked_prefill: bool, + model_impl: str, attn_backend: str, ): if attn_backend == "TREE_ATTN": @@ -389,6 +409,17 @@ def test_eagle_correctness( "TREE_ATTN is flaky in the test disable for now until it can be " "resolved (see https://github.com/vllm-project/vllm/issues/22922)" ) + if model_impl == "transformers": + import transformers + from packaging.version import Version + + installed = Version(transformers.__version__) + required = Version("5.0.0.dev") + if installed < required: + pytest.skip( + "Eagle3 with the Transformers modeling backend requires " + f"transformers>={required}, but got {installed}" + ) # Generate test prompts inside the function instead of using fixture test_prompts = get_test_prompts(mm_enabled) @@ -448,6 +479,7 @@ def test_eagle_correctness( max_model_len=max_model_len, max_num_batched_tokens=max_num_batched_tokens, enable_chunked_prefill=enable_chunked_prefill, + model_impl=model_impl, ) spec_outputs = spec_llm.chat(test_prompts, sampling_config) matches = 0 diff --git a/tests/v1/ec_connector/integration/run_epd_correctness_test.sh b/tests/v1/ec_connector/integration/run_epd_correctness_test.sh index 55dd39c0a957f..0c2666306558c 100644 --- a/tests/v1/ec_connector/integration/run_epd_correctness_test.sh +++ b/tests/v1/ec_connector/integration/run_epd_correctness_test.sh @@ -148,7 +148,7 @@ run_epd_1e_1pd() { --max-num-seqs 128 \ --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_producer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" @@ -167,7 +167,7 @@ run_epd_1e_1pd() { --max-num-seqs 
128 \ --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_consumer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" @@ -348,7 +348,7 @@ run_epd_1e_1p_1d() { --max-num-seqs 128 \ --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_producer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" @@ -369,7 +369,7 @@ run_epd_1e_1p_1d() { --max-num-seqs 128 \ --allowed-local-media-path ${GIT_ROOT}/tests/v1/ec_connector/integration \ --ec-transfer-config '{ - "ec_connector": "ECSharedStorageConnector", + "ec_connector": "ECExampleConnector", "ec_role": "ec_consumer", "ec_connector_extra_config": { "shared_storage_path": "'"$EC_SHARED_STORAGE_PATH"'" diff --git a/tests/v1/ec_connector/unit/test_ec_shared_storage_connector.py b/tests/v1/ec_connector/unit/test_ec_example_connector.py similarity index 90% rename from tests/v1/ec_connector/unit/test_ec_shared_storage_connector.py rename to tests/v1/ec_connector/unit/test_ec_example_connector.py index a58daa2628e21..7e9eb21310031 100644 --- a/tests/v1/ec_connector/unit/test_ec_shared_storage_connector.py +++ b/tests/v1/ec_connector/unit/test_ec_example_connector.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ -Unit tests for ECSharedStorageConnector. +Unit tests for ECExampleConnector. """ import os @@ -13,9 +13,9 @@ import torch from vllm.config import VllmConfig from vllm.distributed.ec_transfer.ec_connector.base import ECConnectorRole -from vllm.distributed.ec_transfer.ec_connector.shared_storage_connector import ( - ECSharedStorageConnector, - ECSharedStorageConnectorMetadata, +from vllm.distributed.ec_transfer.ec_connector.example_connector import ( + ECExampleConnector, + ECExampleConnectorMetadata, MMMeta, ) from vllm.multimodal.inputs import MultiModalFeatureSpec, PlaceholderRange @@ -81,12 +81,12 @@ def mock_request_with_3_mm(): # ------------------ Unit Tests ------------------ # -class TestECSharedStorageConnectorBasics: +class TestECExampleConnectorBasics: """Test basic EC connector functionality.""" def test_initialization_producer(self, mock_vllm_config_producer, temp_storage): """Test connector initializes correctly as producer.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) @@ -98,7 +98,7 @@ class TestECSharedStorageConnectorBasics: def test_initialization_consumer(self, mock_vllm_config_consumer, temp_storage): """Test connector initializes correctly as consumer.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) @@ -109,11 +109,11 @@ class TestECSharedStorageConnectorBasics: def test_role_assignment(self, mock_vllm_config_producer): """Test role is correctly assigned.""" - scheduler_connector = ECSharedStorageConnector( + scheduler_connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) - worker_connector = ECSharedStorageConnector( + worker_connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.WORKER, ) @@ -133,7 
+133,7 @@ class TestCacheExistence: ): """Test has_caches returns True when all 3 caches exist.""" # Test for producer first - producer = ECSharedStorageConnector( + producer = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) @@ -154,7 +154,7 @@ class TestCacheExistence: assert all(producer_result), f"Expected all True, got {producer_result}" # Also test consumer can check if cache exists - consumer = ECSharedStorageConnector( + consumer = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.SCHEDULER, ) @@ -170,7 +170,7 @@ class TestCacheExistence: self, mock_vllm_config_producer, mock_request_with_3_mm ): """Test has_caches returns False when no caches exist.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) @@ -186,7 +186,7 @@ class TestCacheExistence: self, mock_vllm_config_producer, mock_request_with_3_mm ): """Test has_caches with some caches existing (1 of 3).""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) @@ -213,7 +213,7 @@ class TestStateManagement: self, mock_vllm_config_producer, mock_request_with_3_mm ): """Test state update after allocation for 3 MM items.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) @@ -238,7 +238,7 @@ class TestStateManagement: self, mock_vllm_config_producer, mock_request_with_3_mm ): """Test metadata building for 3 MM items.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) @@ -252,7 +252,7 @@ class TestStateManagement: metadata = connector.build_connector_meta(scheduler_output) # Assert - assert isinstance(metadata, ECSharedStorageConnectorMetadata) + assert isinstance(metadata, ECExampleConnectorMetadata) assert len(metadata.mm_datas) == 3 assert metadata.mm_datas[0].mm_hash == "img_hash_1" assert metadata.mm_datas[0].num_token == 100 @@ -266,7 +266,7 @@ class TestStateManagement: def test_build_connector_meta_empty(self, mock_vllm_config_producer): """Test metadata building with empty state.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) @@ -274,14 +274,14 @@ class TestStateManagement: scheduler_output = Mock(spec=SchedulerOutput) metadata = connector.build_connector_meta(scheduler_output) - assert isinstance(metadata, ECSharedStorageConnectorMetadata) + assert isinstance(metadata, ECExampleConnectorMetadata) assert len(metadata.mm_datas) == 0 def test_state_cleared_after_metadata_build( self, mock_vllm_config_producer, mock_request_with_3_mm ): """Test that state is properly cleared after building metadata.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) @@ -310,7 +310,7 @@ class TestCacheSaving: self, mock_vllm_config_producer, mock_request_with_3_mm, temp_storage ): """Test cache saving as producer for 3 different MM items.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.WORKER, ) @@ -336,7 +336,7 @@ class TestCacheSaving: def test_save_caches_consumer_skips(self, mock_vllm_config_consumer): 
"""Test cache saving is skipped for consumer.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) @@ -366,7 +366,7 @@ class TestCacheLoading: ): """Test consumer loads 3 caches from storage.""" # First, create producer to save caches - producer = ECSharedStorageConnector( + producer = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.WORKER, ) @@ -379,13 +379,13 @@ class TestCacheLoading: producer.save_caches(saved_caches, mm_hash) # Now consumer loads - consumer = ECSharedStorageConnector( + consumer = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) # Setup metadata for all 3 - metadata = ECSharedStorageConnectorMetadata() + metadata = ECExampleConnectorMetadata() for mm_hash in mm_hashes: metadata.add_mm_data(MMMeta.make_meta(mm_hash, 100)) consumer.bind_connector_metadata(metadata) @@ -410,7 +410,7 @@ class TestCacheLoading: ): """Test cache loading skips already cached items.""" # Setup: producer saves cache - producer = ECSharedStorageConnector( + producer = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.WORKER, ) @@ -420,12 +420,12 @@ class TestCacheLoading: producer.save_caches({mm_hash: saved_cache}, mm_hash) # Consumer setup - consumer = ECSharedStorageConnector( + consumer = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) - metadata = ECSharedStorageConnectorMetadata() + metadata = ECExampleConnectorMetadata() metadata.add_mm_data(MMMeta.make_meta(mm_hash, 100)) consumer.bind_connector_metadata(metadata) @@ -444,13 +444,13 @@ class TestCacheLoading: def test_start_load_caches_empty_metadata(self, mock_vllm_config_consumer): """Test loading with empty metadata does nothing.""" - consumer = ECSharedStorageConnector( + consumer = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) # Setup empty metadata - metadata = ECSharedStorageConnectorMetadata() + metadata = ECExampleConnectorMetadata() consumer.bind_connector_metadata(metadata) # Load (should not raise) @@ -466,7 +466,7 @@ class TestFilenameGeneration: def test_generate_foldername(self, mock_vllm_config_producer, temp_storage): """Test folder name generation.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.WORKER, ) @@ -479,7 +479,7 @@ class TestFilenameGeneration: def test_generate_filename(self, mock_vllm_config_producer, temp_storage): """Test filename generation.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.WORKER, ) @@ -493,7 +493,7 @@ class TestFilenameGeneration: def test_generate_filename_consistency(self, mock_vllm_config_producer): """Test filename generation is consistent.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.WORKER, ) @@ -510,12 +510,12 @@ class TestMetadataBindingLifecycle: def test_bind_connector_metadata(self, mock_vllm_config_consumer): """Test binding connector metadata.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) - metadata = ECSharedStorageConnectorMetadata() + metadata = ECExampleConnectorMetadata() metadata.add_mm_data(MMMeta.make_meta("hash_1", 100)) 
connector.bind_connector_metadata(metadata) @@ -524,12 +524,12 @@ class TestMetadataBindingLifecycle: def test_clear_connector_metadata(self, mock_vllm_config_consumer): """Test clearing connector metadata.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) - metadata = ECSharedStorageConnectorMetadata() + metadata = ECExampleConnectorMetadata() connector.bind_connector_metadata(metadata) connector.clear_connector_metadata() @@ -538,12 +538,12 @@ class TestMetadataBindingLifecycle: def test_get_connector_metadata(self, mock_vllm_config_consumer): """Test getting connector metadata.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) - metadata = ECSharedStorageConnectorMetadata() + metadata = ECExampleConnectorMetadata() connector.bind_connector_metadata(metadata) retrieved = connector._get_connector_metadata() @@ -552,7 +552,7 @@ class TestMetadataBindingLifecycle: def test_get_connector_metadata_not_set(self, mock_vllm_config_consumer): """Test getting metadata when not set raises.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) @@ -566,7 +566,7 @@ class TestEdgeCases: def test_save_empty_cache(self, mock_vllm_config_producer): """Test saving empty tensor.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.WORKER, ) @@ -579,12 +579,12 @@ class TestEdgeCases: def test_load_nonexistent_cache(self, mock_vllm_config_consumer): """Test loading cache that doesn't exist raises error.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_consumer, role=ECConnectorRole.WORKER, ) - metadata = ECSharedStorageConnectorMetadata() + metadata = ECExampleConnectorMetadata() metadata.add_mm_data(MMMeta.make_meta("nonexistent_hash", 100)) connector.bind_connector_metadata(metadata) @@ -596,7 +596,7 @@ class TestEdgeCases: def test_has_caches_empty_request(self, mock_vllm_config_producer): """Test has_caches with request that has no MM data.""" - connector = ECSharedStorageConnector( + connector = ECExampleConnector( vllm_config=mock_vllm_config_producer, role=ECConnectorRole.SCHEDULER, ) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 48be8c15aba9e..5fa16897b4e0c 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -507,7 +507,7 @@ def test_encoder_instance_zero_kv_cache( ) kv_transfer_config = ( KVTransferConfig( - kv_connector="SharedStorageConnector", + kv_connector="ExampleConnector", kv_role="kv_both", kv_connector_extra_config={"shared_storage_path": "local_storage"}, ) @@ -515,7 +515,7 @@ def test_encoder_instance_zero_kv_cache( else None ) ec_transfer_config = ECTransferConfig( - ec_connector="ECSharedStorageConnector", + ec_connector="ECExampleConnector", ec_role=ec_role, ec_connector_extra_config={"shared_storage_path": "/tmp/ec_test_encoder"}, ) diff --git a/tests/v1/engine/test_init_error_messaging.py b/tests/v1/engine/test_init_error_messaging.py new file mode 100644 index 0000000000000..bc23a68f9deb1 --- /dev/null +++ b/tests/v1/engine/test_init_error_messaging.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project 
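+"""
+Checks that insufficient KV cache memory at engine initialization is surfaced
+as a ValueError from check_enough_kv_cache_memory.
+"""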
+ +import pytest + +from vllm.v1.core.kv_cache_utils import check_enough_kv_cache_memory +from vllm.v1.kv_cache_interface import FullAttentionSpec + + +def test_kv_cache_oom_no_memory(): + from unittest.mock import MagicMock + + config = MagicMock() + config.model_config.max_model_len = 2048 + + spec = { + "layer_0": FullAttentionSpec( + block_size=16, + num_kv_heads=8, + head_size=128, + dtype="float16", + ) + } + + with pytest.raises(ValueError): + check_enough_kv_cache_memory(config, spec, 0) + + +def test_kv_cache_oom_insufficient_memory(monkeypatch): + from unittest.mock import MagicMock + + config = MagicMock() + config.model_config.max_model_len = 2048 + config.cache_config.block_size = 16 + config.parallel_config.tensor_parallel_size = 1 + config.parallel_config.pipeline_parallel_size = 1 + config.parallel_config.decode_context_parallel_size = 1 + + monkeypatch.setattr( + "vllm.v1.core.kv_cache_utils.max_memory_usage_bytes", + lambda c, s: 100 * 1024**3, # 100 GiB + ) + + spec = { + "layer_0": FullAttentionSpec( + block_size=16, + num_kv_heads=8, + head_size=128, + dtype="float16", + ) + } + + with pytest.raises(ValueError): + check_enough_kv_cache_memory(config, spec, 1024**3) # 1 GiB diff --git a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py index 5768fcdb57ceb..b92d3fcd6fb8b 100644 --- a/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py +++ b/tests/v1/kv_connector/nixl_integration/toy_proxy_server.py @@ -30,7 +30,14 @@ async def lifespan(app: FastAPI): prefiller_base_url = f"http://{host}:{port}/v1" app.state.prefill_clients.append( { - "client": httpx.AsyncClient(timeout=None, base_url=prefiller_base_url), + "client": httpx.AsyncClient( + timeout=None, + base_url=prefiller_base_url, + limits=httpx.Limits( + max_connections=None, + max_keepalive_connections=None, + ), + ), "host": host, "port": port, "id": i, @@ -42,7 +49,14 @@ async def lifespan(app: FastAPI): decoder_base_url = f"http://{host}:{port}/v1" app.state.decode_clients.append( { - "client": httpx.AsyncClient(timeout=None, base_url=decoder_base_url), + "client": httpx.AsyncClient( + timeout=None, + base_url=decoder_base_url, + limits=httpx.Limits( + max_connections=None, + max_keepalive_connections=None, + ), + ), "host": host, "port": port, "id": i, @@ -169,6 +183,10 @@ async def send_request_to_service( ) response.raise_for_status() + # read/consume the response body to release the connection + # otherwise, it would http.ReadError + await response.aread() + return response @@ -206,6 +224,7 @@ async def _handle_completions(api: str, request: Request): # Extract the needed fields response_json = response.json() + await response.aclose() # CRITICAL: Release connection back to pool kv_transfer_params = response_json.get("kv_transfer_params", {}) if kv_transfer_params: req_data["kv_transfer_params"] = kv_transfer_params diff --git a/tests/v1/kv_connector/unit/test_backwards_compatibility.py b/tests/v1/kv_connector/unit/test_backwards_compatibility.py index 7cd23805c599d..0d29ca5fca5e5 100644 --- a/tests/v1/kv_connector/unit/test_backwards_compatibility.py +++ b/tests/v1/kv_connector/unit/test_backwards_compatibility.py @@ -218,12 +218,12 @@ def test_internal_connector_uses_new_signature(): Test that internal connectors (registered in factory) always use the new signature and get kv_cache_config. 
""" - from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( - SharedStorageConnector, + from vllm.distributed.kv_transfer.kv_connector.v1.example_connector import ( + ExampleConnector, ) vllm_config = create_vllm_config() - vllm_config.kv_transfer_config.kv_connector = "SharedStorageConnector" + vllm_config.kv_transfer_config.kv_connector = "ExampleConnector" scheduler = create_scheduler(vllm_config) kv_cache_config = scheduler.kv_cache_config @@ -233,7 +233,7 @@ def test_internal_connector_uses_new_signature(): ) assert connector is not None - assert isinstance(connector, SharedStorageConnector) + assert isinstance(connector, ExampleConnector) assert connector._kv_cache_config is not None assert connector._kv_cache_config == kv_cache_config diff --git a/tests/v1/kv_connector/unit/test_cache_pollution_prevention.py b/tests/v1/kv_connector/unit/test_cache_pollution_prevention.py new file mode 100644 index 0000000000000..ec3fb8231e19e --- /dev/null +++ b/tests/v1/kv_connector/unit/test_cache_pollution_prevention.py @@ -0,0 +1,163 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +test that invalid blocks are evicted from prefix cache to prevent pollution. + +verifies that when sync-loading fails, invalid blocks are removed from the +prefix cache hash table so future requests cannot match and reuse corrupted data. +""" + +from collections.abc import Callable +from unittest.mock import Mock + +import pytest + +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.request import Request, RequestStatus + +from .utils import ( + create_model_runner_output, + create_request, + create_scheduler, + create_vllm_config, +) + +pytestmark = pytest.mark.cpu_test + + +def _make_get_num_new_matched_tokens( + req_num_new_matched_tokens: dict[str, int], + async_load: bool, +) -> Callable[[Request, int], tuple[int, bool]]: + def get_num_new_matched_tokens(request: Request, _: int) -> tuple[int, bool]: + value = req_num_new_matched_tokens.get(request.request_id, 0) + return value, async_load + + return get_num_new_matched_tokens + + +@pytest.fixture +def fail_scheduler(): + """scheduler with kv_load_failure_policy='fail'""" + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_load_failure_policy = "fail" + return create_scheduler(vllm_config) + + +def test_invalid_blocks_evicted_prevents_cache_pollution( + fail_scheduler: Scheduler, +): + """ + verify invalid blocks are evicted to prevent future cache hits. + + scenario: + 1. request 1 loads externally-computed blocks (sync mode) + 2. some blocks fail to load and are marked invalid + 3. with fail policy, invalid blocks should be evicted from prefix cache + 4. 
request is marked as FINISHED_ERROR + """ + num_prompt_blocks = 100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * fail_scheduler.block_size + ) + + # request 1: will have invalid blocks + request1 = create_request(num_tokens=num_prompt_tokens, request_id=1) + fail_scheduler.add_request(request=request1) + + req_num_new_matched_tokens = { + request1.request_id: num_external_computed_tokens, + } + + # mock connector indicating sync load + fail_scheduler.connector = Mock() + fail_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False) + ) + fail_scheduler.connector.request_finished.return_value = (False, None) + fail_scheduler.connector.take_events.return_value = () + + scheduler_output = fail_scheduler.schedule() + + # request should be running with sync KV load + assert len(fail_scheduler.running) == 1 + assert request1.status == RequestStatus.RUNNING + + # get allocated block IDs + req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0] + invalid_block_id = req_block_ids[invalid_block_idx] + invalid_block_ids = {invalid_block_id} + + # get the block object to verify eviction later + block = fail_scheduler.kv_cache_manager.block_pool.blocks[invalid_block_id] + + # cache the blocks to simulate they've been computed and cached + # (in real scenario blocks would be cached after compute) + fail_scheduler.kv_cache_manager.cache_blocks(request1, num_external_computed_tokens) + + # verify block has a hash (is cached) before reporting invalid blocks + assert block.block_hash is not None, ( + f"block {invalid_block_id} should be cached (have a hash) before " + f"eviction test, but hash is None" + ) + + # report invalid blocks + model_runner_output = create_model_runner_output( + [request1], + invalid_block_ids=invalid_block_ids, + use_eos=False, + ) + + fail_scheduler.update_from_output(scheduler_output, model_runner_output) + + # verify request finished with error (fail policy) + assert request1.status == RequestStatus.FINISHED_ERROR + + # critical assertion: invalid block and all subsequent blocks should be evicted + # all blocks from invalid_block_idx onwards become invalid since they were + # computed based on the failed block + for idx in range(invalid_block_idx, len(req_block_ids)): + block_id = req_block_ids[idx] + block_obj = fail_scheduler.kv_cache_manager.block_pool.blocks[block_id] + assert block_obj.block_hash is None, ( + f"block {block_id} at index {idx} should have been evicted " + f"(hash reset to None), but hash is {block_obj.block_hash}. " + f"All blocks from index {invalid_block_idx} onwards should be evicted " + f"since they depend on the invalid block at index {invalid_block_idx}." + ) + + # verify cache contains exactly the valid blocks (before first affected block) + # and none of the invalid blocks (from first affected block onwards) + + # valid blocks: all blocks before invalid_block_idx should be cached + for idx in range(invalid_block_idx): + block_id = req_block_ids[idx] + block_obj = fail_scheduler.kv_cache_manager.block_pool.blocks[block_id] + assert block_obj.block_hash is not None, ( + f"valid block {block_id} at index {idx} should still be cached " + f"(have a hash), but hash is None. Only blocks from index " + f"{invalid_block_idx} onwards should be evicted." 
+ ) + + # invalid blocks: verify they're not in the cached_block_hash_to_block map + cached_blocks = ( + fail_scheduler.kv_cache_manager.block_pool.cached_block_hash_to_block + ) + cached_block_ids = { + b.block_id + for blocks_val in cached_blocks._cache.values() + for b in ( + [blocks_val] if not isinstance(blocks_val, dict) else blocks_val.values() + ) + } + + for idx in range(invalid_block_idx, len(req_block_ids)): + block_id = req_block_ids[idx] + assert block_id not in cached_block_ids, ( + f"invalid block {block_id} at index {idx} should not be in cache hash table" + ) diff --git a/tests/v1/kv_connector/unit/test_error_propagation.py b/tests/v1/kv_connector/unit/test_error_propagation.py new file mode 100644 index 0000000000000..20e181f379f5c --- /dev/null +++ b/tests/v1/kv_connector/unit/test_error_propagation.py @@ -0,0 +1,147 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Callable +from unittest.mock import Mock + +import pytest + +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.request import FinishReason, Request, RequestStatus + +from .utils import ( + create_model_runner_output, + create_request, + create_scheduler, + create_vllm_config, +) + +pytestmark = pytest.mark.cpu_test + + +def _make_get_num_new_matched_tokens( + req_num_new_matched_tokens: dict[str, int], + async_load: bool, +) -> Callable[[Request, int], tuple[int, bool]]: + def get_num_new_matched_tokens(request: Request, _: int) -> tuple[int, bool]: + value = req_num_new_matched_tokens.get(request.request_id, 0) + return value, async_load + + return get_num_new_matched_tokens + + +@pytest.fixture +def fail_scheduler(): + """scheduler with kv_load_failure_policy='fail'""" + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_load_failure_policy = "fail" + return create_scheduler(vllm_config) + + +def test_error_propagation_sync_load(fail_scheduler: Scheduler): + """test invalid_block_ids with fail policy -> FINISHED_ERROR (sync load)""" + num_prompt_blocks = 100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * fail_scheduler.block_size + ) + + request = create_request(num_tokens=num_prompt_tokens) + fail_scheduler.add_request(request=request) + + req_num_new_matched_tokens = { + request.request_id: num_external_computed_tokens, + } + + fail_scheduler.connector = Mock() + fail_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False) + ) + fail_scheduler.connector.request_finished.return_value = (False, None) + fail_scheduler.connector.take_events.return_value = () + + scheduler_output = fail_scheduler.schedule() + + assert len(fail_scheduler.running) == 1 + assert len(scheduler_output.scheduled_new_reqs) == 1 + assert fail_scheduler.connector.get_num_new_matched_tokens.call_count == 1 + + req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0] + invalid_block_ids = {req_block_ids[invalid_block_idx]} + model_runner_output = create_model_runner_output( + [request], + invalid_block_ids=invalid_block_ids, + use_eos=True, + ) + + outputs = fail_scheduler.update_from_output(scheduler_output, model_runner_output) + + assert request.status == RequestStatus.FINISHED_ERROR + assert request.get_finished_reason() == FinishReason.ERROR + + assert len(outputs) == 1 + 
engine_outputs = next(iter(outputs.values())) + assert len(engine_outputs.outputs) == 1 + output = engine_outputs.outputs[0] + assert output.request_id == request.request_id + assert output.finish_reason == FinishReason.ERROR + + assert len(fail_scheduler.running) == 0 + + +def test_error_propagation_async_load(fail_scheduler: Scheduler): + """test invalid_block_ids with fail policy -> FINISHED_ERROR (async load)""" + num_prompt_blocks = 100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * fail_scheduler.block_size + ) + + request = create_request(num_tokens=num_prompt_tokens) + fail_scheduler.add_request(request=request) + + req_num_new_matched_tokens = { + request.request_id: num_external_computed_tokens, + } + + fail_scheduler.connector = Mock() + fail_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, True) + ) + fail_scheduler.connector.request_finished.return_value = (False, None) + fail_scheduler.connector.take_events.return_value = () + + scheduler_output = fail_scheduler.schedule() + + assert len(fail_scheduler.waiting) == 1 + assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS + assert request.num_computed_tokens == 0 + + (req_block_ids,) = fail_scheduler.kv_cache_manager.get_block_ids(request.request_id) + invalid_block_ids = {req_block_ids[invalid_block_idx]} + model_runner_output = create_model_runner_output( + reqs=[], + finished_recving=set(), + invalid_block_ids=invalid_block_ids, + use_eos=True, + ) + + outputs = fail_scheduler.update_from_output(scheduler_output, model_runner_output) + + assert request.status == RequestStatus.FINISHED_ERROR + assert request.get_finished_reason() == FinishReason.ERROR + + assert len(outputs) == 1 + engine_outputs = next(iter(outputs.values())) + assert len(engine_outputs.outputs) == 1 + output = engine_outputs.outputs[0] + assert output.request_id == request.request_id + assert output.finish_reason == FinishReason.ERROR + + assert len(fail_scheduler.waiting) == 0 diff --git a/tests/v1/kv_connector/unit/test_shared_storage_connector.py b/tests/v1/kv_connector/unit/test_example_connector.py similarity index 97% rename from tests/v1/kv_connector/unit/test_shared_storage_connector.py rename to tests/v1/kv_connector/unit/test_example_connector.py index ff4697a978255..75edb79fb4af4 100644 --- a/tests/v1/kv_connector/unit/test_shared_storage_connector.py +++ b/tests/v1/kv_connector/unit/test_example_connector.py @@ -119,16 +119,16 @@ def process_prompt(processor, llm: LLM, question: str, image_urls: list[Image]): ) def test_shared_storage_connector_hashes(tmp_path): """ - Tests that SharedStorageConnector saves KV to the storage locations + Tests that ExampleConnector saves KV to the storage locations with proper hashes; that are unique for inputs with identical text but different images (same size), or same multiple images but different orders. 
""" # Using tmp_path as the storage path to store KV print(f"KV storage path at: {str(tmp_path)}") - # Configure the SharedStorageConnector + # Configure the ExampleConnector kv_transfer_config = KVTransferConfig( - kv_connector="SharedStorageConnector", + kv_connector="ExampleConnector", kv_role="kv_both", kv_connector_extra_config={"shared_storage_path": str(tmp_path)}, ) diff --git a/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py b/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py new file mode 100644 index 0000000000000..940f3a98308b6 --- /dev/null +++ b/tests/v1/kv_connector/unit/test_invalid_blocks_correctness.py @@ -0,0 +1,454 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +""" +Tests for correctness in invalid block handling. + +These tests verify correct behavior in three scenarios: +1. Sync recompute case: Blocks should not be freed for running requests + that need to recompute invalid blocks +2. Sync fail case: Invalid blocks must be evicted from cache when request fails +3. Async recompute case: Invalid blocks should not be cached after transfer +""" + +from collections.abc import Callable +from unittest.mock import Mock + +import pytest + +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.request import FinishReason, Request, RequestStatus + +from .utils import ( + create_model_runner_output, + create_request, + create_scheduler, + create_vllm_config, +) + +pytestmark = pytest.mark.cpu_test + + +def _make_get_num_new_matched_tokens( + req_num_new_matched_tokens: dict[str, int], + async_load: bool, +) -> Callable[[Request, int], tuple[int, bool]]: + def get_num_new_matched_tokens(request: Request, _: int) -> tuple[int, bool]: + value = req_num_new_matched_tokens.get(request.request_id, 0) + return value, async_load + + return get_num_new_matched_tokens + + +@pytest.fixture +def fail_scheduler(): + """scheduler with kv_load_failure_policy='fail'""" + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_load_failure_policy = "fail" + return create_scheduler(vllm_config) + + +@pytest.fixture +def recompute_scheduler(): + """scheduler with kv_load_failure_policy='recompute'""" + vllm_config = create_vllm_config() + vllm_config.kv_transfer_config.kv_load_failure_policy = "recompute" + return create_scheduler(vllm_config) + + +def test_sync_recompute_blocks_not_freed_for_running_requests( + recompute_scheduler: Scheduler, +): + """ + Test sync recompute case - blocks must not be freed for running requests. + + When a running request has invalid blocks and retry_policy is 'recompute': + 1. Request should remain in RUNNING state + 2. num_computed_tokens should be truncated to invalid block boundary + 3. Blocks should NOT be freed (request still needs them for recomputation) + 4. 
Request should remain in scheduler.requests and scheduler.running + """ + num_prompt_blocks = 100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * recompute_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * recompute_scheduler.block_size + ) + + request = create_request(num_tokens=num_prompt_tokens) + recompute_scheduler.add_request(request=request) + + req_num_new_matched_tokens = { + request.request_id: num_external_computed_tokens, + } + + # mock connector indicating sync load + recompute_scheduler.connector = Mock() + recompute_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False) + ) + recompute_scheduler.connector.request_finished.return_value = (False, None) + recompute_scheduler.connector.take_events.return_value = () + + scheduler_output = recompute_scheduler.schedule() + + # request should be running with sync KV load + assert len(recompute_scheduler.running) == 1 + assert len(scheduler_output.scheduled_new_reqs) == 1 + assert request.status == RequestStatus.RUNNING + + # get the allocated block IDs before invalid blocks are reported + req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0] + invalid_block_ids = {req_block_ids[invalid_block_idx]} + + # store original num_computed_tokens for comparison + original_num_computed_tokens = request.num_computed_tokens + + model_runner_output = create_model_runner_output( + [request], + invalid_block_ids=invalid_block_ids, + use_eos=False, # not finished - should continue running + ) + + outputs = recompute_scheduler.update_from_output( + scheduler_output, model_runner_output + ) + + # critical assertions for recompute case: + + # 1. request should still be RUNNING (not finished, not aborted) + assert request.status == RequestStatus.RUNNING, ( + f"Request should remain RUNNING for recompute, got {request.status}" + ) + + # 2. num_computed_tokens should be truncated to first invalid block + expected_truncated_tokens = invalid_block_idx * recompute_scheduler.block_size + assert request.num_computed_tokens == expected_truncated_tokens, ( + f"num_computed_tokens should be truncated to {expected_truncated_tokens}, " + f"got {request.num_computed_tokens}" + ) + assert request.num_computed_tokens < original_num_computed_tokens, ( + "num_computed_tokens should be reduced after invalid block detection" + ) + + # 3. no output should be generated (request is still running) + # the request should be skipped in the output loop + assert len(outputs) == 0 or request.request_id not in [ + out.request_id for outs in outputs.values() for out in outs.outputs + ], "No output should be generated for recompute requests" + + # 4. request should still be in running queue + assert request in recompute_scheduler.running, ( + "Request should remain in running queue for recomputation" + ) + + # 5. request should still be in scheduler.requests (not deleted) + assert request.request_id in recompute_scheduler.requests, ( + "Request should not be deleted from scheduler.requests" + ) + + # 6. blocks should NOT be freed - verify blocks are still allocated + try: + allocated_blocks = recompute_scheduler.kv_cache_manager.get_block_ids( + request.request_id + ) + assert allocated_blocks is not None + assert len(allocated_blocks[0]) > 0, ( + "Blocks should still be allocated for recomputation" + ) + except KeyError: + pytest.fail( + "Blocks were freed incorrectly! 
Running requests need their blocks " + "to recompute invalid portions." + ) + + # 7. verify request can be rescheduled in next step + scheduler_output_2 = recompute_scheduler.schedule() + + # request should appear in the new schedule to recompute invalid blocks + scheduled_req_ids = [ + req.request_id for req in scheduler_output_2.scheduled_new_reqs + ] + if scheduler_output_2.num_scheduled_tokens: + scheduled_req_ids.extend(scheduler_output_2.num_scheduled_tokens.keys()) + + assert ( + request.request_id in scheduled_req_ids or len(recompute_scheduler.running) > 0 + ), "Request should be reschedulable for recomputation" + + +def test_sync_fail_invalid_blocks_evicted(fail_scheduler: Scheduler): + """ + Test sync fail case - invalid blocks must be evicted from cache. + + When a request fails with policy='fail' and has invalid blocks from sync loading: + 1. Request should be finished with FINISHED_ERROR + 2. Invalid blocks should be evicted from the KV cache + 3. Valid blocks (if shared) should remain in cache + 4. Future requests should not reuse the invalid blocks + + This test verifies that invalid blocks are properly evicted to prevent + cache corruption and reuse of invalid data. + """ + num_prompt_blocks = 100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * fail_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * fail_scheduler.block_size + ) + + request = create_request(num_tokens=num_prompt_tokens) + fail_scheduler.add_request(request=request) + + req_num_new_matched_tokens = { + request.request_id: num_external_computed_tokens, + } + + # mock connector indicating sync load + fail_scheduler.connector = Mock() + fail_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, False) + ) + fail_scheduler.connector.request_finished.return_value = (False, None) + fail_scheduler.connector.take_events.return_value = () + + scheduler_output = fail_scheduler.schedule() + + # request should be running with sync KV load + assert len(fail_scheduler.running) == 1 + assert request.status == RequestStatus.RUNNING + + # get allocated block IDs + req_block_ids = scheduler_output.scheduled_new_reqs[0].block_ids[0] + invalid_block_id = req_block_ids[invalid_block_idx] + invalid_block_ids = {invalid_block_id} + + # verify the block is in the block pool before we report it as invalid + block = fail_scheduler.kv_cache_manager.block_pool.blocks[invalid_block_id] + assert block is not None + + # report invalid blocks - request should fail + model_runner_output = create_model_runner_output( + [request], + invalid_block_ids=invalid_block_ids, + use_eos=True, + ) + + outputs = fail_scheduler.update_from_output(scheduler_output, model_runner_output) + + # verify request is finished with error + assert request.status == RequestStatus.FINISHED_ERROR + assert request.get_finished_reason() == FinishReason.ERROR + + # verify output is generated + assert len(outputs) == 1 + engine_outputs = next(iter(outputs.values())) + assert len(engine_outputs.outputs) == 1 + output = engine_outputs.outputs[0] + assert output.request_id == request.request_id + assert output.finish_reason == FinishReason.ERROR + + # verify the request was removed from scheduler + assert request.request_id not in fail_scheduler.requests + assert len(fail_scheduler.running) == 0 + + # critical: verify invalid block was actually freed from cache + # this is the key assertion - the invalid 
block should no longer be + # tracked by the KV cache manager for this request + # if it's still there, a future request could reuse the invalid data + try: + block_ids = fail_scheduler.kv_cache_manager.get_block_ids(request.request_id) + # if we get here, check if blocks were actually freed + if block_ids is not None and len(block_ids[0]) > 0: + pytest.fail( + f"Invalid blocks still tracked for finished request! " + f"Request {request.request_id} should have been freed but " + f"still has {len(block_ids[0])} blocks allocated." + ) + # blocks list exists but is empty - this is fine, they were freed + except KeyError: + # expected - request completely removed from tracking + pass + + # critical: verify invalid block was evicted from prefix cache + # the block should no longer have a hash (hash is reset on eviction) + assert block.block_hash is None, ( + f"Invalid block {invalid_block_id} should have been evicted from cache " + f"(hash should be None), but hash is still {block.block_hash}" + ) + + +def test_async_recompute_blocks_not_cached_when_invalid( + recompute_scheduler: Scheduler, +): + """ + Test async recompute case - invalid blocks not cached after transfer. + + When async KV loading has invalid blocks and retry_policy is 'recompute': + 1. Blocks are allocated but not cached yet + 2. When async transfer completes, only valid blocks should be cached + 3. Invalid blocks should never enter the prefix cache + + This test verifies correctness, the failed_recving_kv_req_ids protection + ensures only valid blocks are cached when the transfer completes, and we + only evict blocks from cache that are already hashed in the block table. + """ + from unittest.mock import patch + + num_prompt_blocks = 100 + num_external_computed_blocks = 99 + invalid_block_idx = 50 + + num_prompt_tokens = num_prompt_blocks * recompute_scheduler.block_size + num_external_computed_tokens = ( + num_external_computed_blocks * recompute_scheduler.block_size + ) + + request = create_request(num_tokens=num_prompt_tokens) + recompute_scheduler.add_request(request=request) + + req_num_new_matched_tokens = { + request.request_id: num_external_computed_tokens, + } + + # mock connector indicating async load + recompute_scheduler.connector = Mock() + recompute_scheduler.connector.get_num_new_matched_tokens.side_effect = ( + _make_get_num_new_matched_tokens(req_num_new_matched_tokens, True) + ) + recompute_scheduler.connector.request_finished.return_value = (False, None) + recompute_scheduler.connector.take_events.return_value = () + + scheduler_output = recompute_scheduler.schedule() + + # request should be waiting for remote KVs + assert len(recompute_scheduler.waiting) == 1 + assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS + assert request.num_computed_tokens == 0 + + # get the allocated block IDs + (req_block_ids,) = recompute_scheduler.kv_cache_manager.get_block_ids( + request.request_id + ) + invalid_block_id = req_block_ids[invalid_block_idx] + invalid_block_ids = {invalid_block_id} + + # get the block object to verify it's not cached yet and stays uncached + block = recompute_scheduler.kv_cache_manager.block_pool.blocks[invalid_block_id] + + # verify block has no hash before invalid blocks are reported + assert block.block_hash is None, ( + "Async loading blocks should not be cached yet (no hash)" + ) + + # report invalid blocks (transfer not finished yet) + model_runner_output = create_model_runner_output( + reqs=[], + finished_recving=None, # transfer NOT finished + 
invalid_block_ids=invalid_block_ids, + use_eos=False, + ) + + # critical: spy on evict_blocks to verify it's NOT called for async blocks + original_evict_blocks = recompute_scheduler.kv_cache_manager.evict_blocks + evict_blocks_calls = [] + + def evict_blocks_spy(block_ids): + evict_blocks_calls.append(set(block_ids)) + return original_evict_blocks(block_ids) + + with patch.object( + recompute_scheduler.kv_cache_manager, "evict_blocks", evict_blocks_spy + ): + recompute_scheduler.update_from_output(scheduler_output, model_runner_output) + + # verify evict_blocks was NOT called (async blocks excluded from eviction) + assert len(evict_blocks_calls) == 0, ( + f"evict_blocks should not be called for async-only invalid blocks, " + f"but was called {len(evict_blocks_calls)} time(s) with {evict_blocks_calls}" + ) + + # request should still be waiting (not finished with error due to recompute policy) + assert request.status == RequestStatus.WAITING_FOR_REMOTE_KVS + assert request.request_id in recompute_scheduler.failed_recving_kv_req_ids + + # verify num_computed_tokens was truncated to before invalid block + expected_valid_tokens = invalid_block_idx * recompute_scheduler.block_size + assert request.num_computed_tokens == expected_valid_tokens + + # verify invalid block still has no hash (was not evicted) + assert block.block_hash is None, ( + f"Async loading blocks shouldn't be cached or evicted. " + f"Block {invalid_block_id} hash should be None but is {block.block_hash}" + ) + + # now simulate async transfer completing + model_runner_output_2 = create_model_runner_output( + reqs=[], + finished_recving={request.request_id}, + invalid_block_ids=None, + use_eos=False, + ) + + recompute_scheduler.update_from_output(scheduler_output, model_runner_output_2) + + # verify request is now marked as finished receiving and ready to be processed + assert request.request_id in recompute_scheduler.finished_recving_kv_req_ids + assert request.request_id in recompute_scheduler.failed_recving_kv_req_ids + + # critical: verify invalid block still has no hash before recompute + # the async transfer invalid data was never cached + assert block.block_hash is None, ( + f"Invalid block {invalid_block_id} should not be cached before recompute " + f"(hash should be None), but hash is {block.block_hash}" + ) + + # critical end-to-end test: spy on cache_blocks to verify it's called with + # the truncated num_computed_tokens value + original_cache_blocks = recompute_scheduler.kv_cache_manager.cache_blocks + cache_blocks_calls = [] + + def cache_blocks_spy(req, num_tokens): + cache_blocks_calls.append((req.request_id, num_tokens)) + return original_cache_blocks(req, num_tokens) + + with patch.object( + recompute_scheduler.kv_cache_manager, "cache_blocks", cache_blocks_spy + ): + # call schedule() again - this triggers _update_waiting_for_remote_kv() + # which should call cache_blocks with the truncated value + recompute_scheduler.schedule() + + # verify cache_blocks was called with the truncated value + assert len(cache_blocks_calls) == 1, ( + f"cache_blocks should be called exactly once, " + f"got {len(cache_blocks_calls)} calls" + ) + cached_req_id, cached_num_tokens = cache_blocks_calls[0] + assert cached_req_id == request.request_id + assert cached_num_tokens == expected_valid_tokens, ( + f"cache_blocks should be called with truncated value {expected_valid_tokens}, " + f"but was called with {cached_num_tokens}" + ) + + # request should now be RUNNING (scheduled immediately after transfer completes) + # the flow is: 
WAITING_FOR_REMOTE_KVS -> WAITING -> RUNNING in same schedule() call + assert request.status == RequestStatus.RUNNING + + # num_computed_tokens should be >= expected_valid_tokens because the scheduler + # will schedule additional new tokens (up to max_num_batched_tokens) for the request + assert request.num_computed_tokens >= expected_valid_tokens, ( + f"num_computed_tokens should be at least {expected_valid_tokens}, " + f"got {request.num_computed_tokens}" + ) + + # request should no longer be in the failed/finished receiving sets + assert request.request_id not in recompute_scheduler.failed_recving_kv_req_ids + assert request.request_id not in recompute_scheduler.finished_recving_kv_req_ids + + # request should be in the running queue + assert request in recompute_scheduler.running diff --git a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py index d0a6eeae6286d..4ba6b2201d0e2 100644 --- a/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py +++ b/tests/v1/kv_connector/unit/test_kv_connector_lifecyle.py @@ -1,8 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa: E501 - SharedStorageConnectorMetadata, +from vllm.distributed.kv_transfer.kv_connector.v1.example_connector import ( # noqa: E501 + ExampleConnectorMetadata, ) from vllm.distributed.kv_transfer.kv_transfer_state import ( ensure_kv_transfer_initialized, @@ -11,7 +11,7 @@ from vllm.distributed.kv_transfer.kv_transfer_state import ( from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin -# Importing utils registers TestSharedStorageConnector with the factory +# Importing utils registers TestExampleConnector with the factory from .utils import create_vllm_config @@ -26,13 +26,13 @@ def _make_empty_scheduler_output(): num_common_prefix_blocks=[], finished_req_ids=set(), free_encoder_mm_hashes=[], - kv_connector_metadata=SharedStorageConnectorMetadata(), + kv_connector_metadata=ExampleConnectorMetadata(), ) def test_kv_connector_mixin_clears_metadata(): vllm_config = create_vllm_config() - vllm_config.kv_transfer_config.kv_connector = "TestSharedStorageConnector" + vllm_config.kv_transfer_config.kv_connector = "TestExampleConnector" vllm_config.kv_transfer_config.kv_role = "kv_both" vllm_config.kv_transfer_config.kv_connector_extra_config["name"] = "unit" diff --git a/tests/v1/kv_connector/unit/test_lmcache_connector.py b/tests/v1/kv_connector/unit/test_lmcache_connector.py new file mode 100644 index 0000000000000..6a8cfc71a67a6 --- /dev/null +++ b/tests/v1/kv_connector/unit/test_lmcache_connector.py @@ -0,0 +1,756 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from unittest.mock import MagicMock + +import pytest + +from vllm.distributed.kv_events import BlockStored +from vllm.distributed.kv_transfer.kv_connector.v1.lmcache_connector import ( + LMCacheConnectorV1, + LMCacheKVEvents, +) +from vllm.v1.outputs import KVConnectorOutput + + +@pytest.fixture +def mock_lmcache_engine_event(): + """Create a mock event object that mimics what the lmcache engine returns.""" + + class MockEvent: + def __init__( + self, + block_hashes, + parent_block_hash, + token_ids, + lora_id, + block_size, + medium, + ): + self.block_hashes = block_hashes + 
self.parent_block_hash = parent_block_hash + self.token_ids = token_ids + self.lora_id = lora_id + self.block_size = block_size + self.medium = medium + + return MockEvent( + block_hashes=["hash1", "hash2"], + parent_block_hash="parent_hash", + token_ids=[1, 2, 3, 4], + lora_id=None, + block_size=16, + medium="GPU", + ) + + +@pytest.fixture +def mock_connector(): + """Create a mock LMCacheConnectorV1 instance with mocked dependencies.""" + connector = MagicMock(spec=LMCacheConnectorV1) + connector._kv_cache_events = None + connector._lmcache_engine = MagicMock() + + # Make the methods use the real implementation + connector.get_kv_connector_kv_cache_events = ( + LMCacheConnectorV1.get_kv_connector_kv_cache_events.__get__( + connector, LMCacheConnectorV1 + ) + ) + connector.update_connector_output = ( + LMCacheConnectorV1.update_connector_output.__get__( + connector, LMCacheConnectorV1 + ) + ) + connector.take_events = LMCacheConnectorV1.take_events.__get__( + connector, LMCacheConnectorV1 + ) + + return connector + + +class TestGetKVConnectorKVCacheEvents: + """Test get_kv_connector_kv_cache_events method.""" + + def test_returns_none_when_no_events(self, mock_connector): + """Test that None is returned when lmcache engine has no events.""" + mock_connector._lmcache_engine.get_kv_events.return_value = None + + result = mock_connector.get_kv_connector_kv_cache_events() + + assert result is None + mock_connector._lmcache_engine.get_kv_events.assert_called_once() + + def test_returns_none_when_empty_list(self, mock_connector): + """Test that None is returned when lmcache engine returns empty list.""" + mock_connector._lmcache_engine.get_kv_events.return_value = [] + + result = mock_connector.get_kv_connector_kv_cache_events() + + assert result is None + + def test_converts_single_event(self, mock_connector, mock_lmcache_engine_event): + """Test conversion of a single event from lmcache engine format.""" + mock_connector._lmcache_engine.get_kv_events.return_value = [ + mock_lmcache_engine_event + ] + + result = mock_connector.get_kv_connector_kv_cache_events() + + assert result is not None + assert isinstance(result, LMCacheKVEvents) + assert result.get_number_of_workers() == 1 + + events = result.get_all_events() + assert len(events) == 1 + assert isinstance(events[0], BlockStored) + assert events[0].block_hashes == ["hash1", "hash2"] + assert events[0].parent_block_hash == "parent_hash" + assert events[0].token_ids == [1, 2, 3, 4] + assert events[0].lora_id is None + assert events[0].block_size == 16 + assert events[0].medium == "GPU" + + def test_converts_multiple_events(self, mock_connector): + """Test conversion of multiple events from lmcache engine format.""" + + class MockEvent: + def __init__(self, i): + self.block_hashes = [f"hash{i}"] + self.parent_block_hash = f"parent{i}" + self.token_ids = [i] + self.lora_id = None + self.block_size = 16 + self.medium = "GPU" + + events = [MockEvent(i) for i in range(5)] + mock_connector._lmcache_engine.get_kv_events.return_value = events + + result = mock_connector.get_kv_connector_kv_cache_events() + + assert result is not None + assert isinstance(result, LMCacheKVEvents) + + converted_events = result.get_all_events() + assert len(converted_events) == 5 + + for i, event in enumerate(converted_events): + assert isinstance(event, BlockStored) + assert event.block_hashes == [f"hash{i}"] + assert event.parent_block_hash == f"parent{i}" + assert event.token_ids == [i] + + def test_preserves_event_attributes(self, mock_connector): + """Test that all 
event attributes are correctly preserved.""" + + class MockEventWithLora: + def __init__(self): + self.block_hashes = ["hash_a", "hash_b", "hash_c"] + self.parent_block_hash = "parent_xyz" + self.token_ids = [100, 200, 300] + self.lora_id = 42 + self.block_size = 32 + self.medium = "DISK" + + mock_connector._lmcache_engine.get_kv_events.return_value = [ + MockEventWithLora() + ] + + result = mock_connector.get_kv_connector_kv_cache_events() + + events = result.get_all_events() + event = events[0] + + assert event.block_hashes == ["hash_a", "hash_b", "hash_c"] + assert event.parent_block_hash == "parent_xyz" + assert event.token_ids == [100, 200, 300] + assert event.lora_id == 42 + assert event.block_size == 32 + assert event.medium == "DISK" + + def test_handles_none_parent_block_hash(self, mock_connector): + """Test handling of events with None parent_block_hash.""" + + class MockEventNoParent: + def __init__(self): + self.block_hashes = ["hash1"] + self.parent_block_hash = None + self.token_ids = [1, 2] + self.lora_id = None + self.block_size = 16 + self.medium = "GPU" + + mock_connector._lmcache_engine.get_kv_events.return_value = [ + MockEventNoParent() + ] + + result = mock_connector.get_kv_connector_kv_cache_events() + + events = result.get_all_events() + assert events[0].parent_block_hash is None + + +class TestUpdateConnectorOutput: + """Test update_connector_output method.""" + + def test_does_nothing_when_kv_cache_events_is_none(self, mock_connector): + """Test that method returns early when kv_cache_events is None.""" + connector_output = KVConnectorOutput(kv_cache_events=None) + + mock_connector.update_connector_output(connector_output) + + assert mock_connector._kv_cache_events is None + + def test_does_nothing_when_kv_cache_events_is_not_lmcache_kv_events( + self, mock_connector + ): + """Test that method returns early when kv_cache_events is not + LMCacheKVEvents.""" + # Create a mock object that is not LMCacheKVEvents + fake_events = MagicMock() + connector_output = KVConnectorOutput(kv_cache_events=fake_events) + + mock_connector.update_connector_output(connector_output) + + assert mock_connector._kv_cache_events is None + + def test_sets_kv_cache_events_when_none(self, mock_connector): + """Test that _kv_cache_events is set when it was None.""" + kv_events = LMCacheKVEvents(num_workers=1) + event = BlockStored( + block_hashes=["hash1"], + parent_block_hash=None, + token_ids=[1, 2], + block_size=16, + lora_id=None, + medium="GPU", + ) + kv_events.add_events([event]) + + connector_output = KVConnectorOutput(kv_cache_events=kv_events) + + mock_connector.update_connector_output(connector_output) + + assert mock_connector._kv_cache_events is kv_events + + def test_adds_events_when_kv_cache_events_already_exists(self, mock_connector): + """Test that events are added when _kv_cache_events already exists.""" + # Set up existing events + existing_events = LMCacheKVEvents(num_workers=2) + event1 = BlockStored( + block_hashes=["hash1"], + parent_block_hash=None, + token_ids=[1], + block_size=16, + lora_id=None, + medium="GPU", + ) + existing_events.add_events([event1]) + existing_events.add_events([event1]) # Simulate 2 workers reporting + + mock_connector._kv_cache_events = existing_events + + # Create new events to add + new_events = LMCacheKVEvents(num_workers=1) + event2 = BlockStored( + block_hashes=["hash2"], + parent_block_hash=None, + token_ids=[2], + block_size=16, + lora_id=None, + medium="GPU", + ) + new_events.add_events([event2]) + + connector_output = 
KVConnectorOutput(kv_cache_events=new_events) + + mock_connector.update_connector_output(connector_output) + + # Check that events were added + all_events = mock_connector._kv_cache_events.get_all_events() + assert len(all_events) == 3 # 2 from existing + 1 from new + assert event1 in all_events + assert event2 in all_events + + def test_increments_workers_when_kv_cache_events_already_exists( + self, mock_connector + ): + """Test that worker count is incremented correctly.""" + # Set up existing events with 2 workers + existing_events = LMCacheKVEvents(num_workers=2) + mock_connector._kv_cache_events = existing_events + + # Create new events from 3 workers + new_events = LMCacheKVEvents(num_workers=3) + event = BlockStored( + block_hashes=["hash1"], + parent_block_hash=None, + token_ids=[1], + block_size=16, + lora_id=None, + medium="GPU", + ) + new_events.add_events([event]) + + connector_output = KVConnectorOutput(kv_cache_events=new_events) + + mock_connector.update_connector_output(connector_output) + + # Worker count should be 2 + 3 = 5 + assert mock_connector._kv_cache_events.get_number_of_workers() == 5 + + def test_multiple_updates(self, mock_connector): + """Test multiple consecutive updates.""" + # First update + events1 = LMCacheKVEvents(num_workers=1) + event1 = BlockStored( + block_hashes=["hash1"], + parent_block_hash=None, + token_ids=[1], + block_size=16, + lora_id=None, + medium="GPU", + ) + events1.add_events([event1]) + output1 = KVConnectorOutput(kv_cache_events=events1) + mock_connector.update_connector_output(output1) + + # Second update + events2 = LMCacheKVEvents(num_workers=2) + event2 = BlockStored( + block_hashes=["hash2"], + parent_block_hash=None, + token_ids=[2], + block_size=16, + lora_id=None, + medium="GPU", + ) + events2.add_events([event2]) + output2 = KVConnectorOutput(kv_cache_events=events2) + mock_connector.update_connector_output(output2) + + # Third update + events3 = LMCacheKVEvents(num_workers=1) + event3 = BlockStored( + block_hashes=["hash3"], + parent_block_hash=None, + token_ids=[3], + block_size=16, + lora_id=None, + medium="GPU", + ) + events3.add_events([event3]) + output3 = KVConnectorOutput(kv_cache_events=events3) + mock_connector.update_connector_output(output3) + + # Check final state + all_events = mock_connector._kv_cache_events.get_all_events() + assert len(all_events) == 3 + assert mock_connector._kv_cache_events.get_number_of_workers() == 4 # 1+2+1 + + def test_updates_with_empty_events(self, mock_connector): + """Test updating with empty event lists.""" + # First update with actual events + events1 = LMCacheKVEvents(num_workers=1) + event1 = BlockStored( + block_hashes=["hash1"], + parent_block_hash=None, + token_ids=[1], + block_size=16, + lora_id=None, + medium="GPU", + ) + events1.add_events([event1]) + output1 = KVConnectorOutput(kv_cache_events=events1) + mock_connector.update_connector_output(output1) + + # Second update with empty events + events2 = LMCacheKVEvents(num_workers=2) + # No events added + output2 = KVConnectorOutput(kv_cache_events=events2) + mock_connector.update_connector_output(output2) + + # Should still have the original event + all_events = mock_connector._kv_cache_events.get_all_events() + assert len(all_events) == 1 + assert mock_connector._kv_cache_events.get_number_of_workers() == 3 + + +class TestTakeEvents: + """Test take_events method.""" + + def test_yields_nothing_when_kv_cache_events_is_none(self, mock_connector): + """Test that nothing is yielded when _kv_cache_events is None.""" + 
mock_connector._kv_cache_events = None + + events = list(mock_connector.take_events()) + + assert events == [] + + def test_yields_events_and_clears(self, mock_connector): + """Test that events are yielded and then cleared.""" + # Set up events + kv_events = LMCacheKVEvents(num_workers=1) + event1 = BlockStored( + block_hashes=["hash1"], + parent_block_hash=None, + token_ids=[1], + block_size=16, + lora_id=None, + medium="GPU", + ) + event2 = BlockStored( + block_hashes=["hash2"], + parent_block_hash=None, + token_ids=[2], + block_size=16, + lora_id=None, + medium="GPU", + ) + kv_events.add_events([event1, event2]) + mock_connector._kv_cache_events = kv_events + + # Take events + events = list(mock_connector.take_events()) + + # Check that events were yielded + assert len(events) == 2 + assert event1 in events + assert event2 in events + + # Check that _kv_cache_events was cleared + assert mock_connector._kv_cache_events is None + + def test_aggregates_before_yielding(self, mock_connector): + """Test that events are aggregated before yielding.""" + # Set up events from multiple workers + kv_events = LMCacheKVEvents(num_workers=3) + common_event = BlockStored( + block_hashes=["hash_common"], + parent_block_hash=None, + token_ids=[1], + block_size=16, + lora_id=None, + medium="GPU", + ) + uncommon_event = BlockStored( + block_hashes=["hash_uncommon"], + parent_block_hash=None, + token_ids=[2], + block_size=16, + lora_id=None, + medium="GPU", + ) + + # All 3 workers report common_event + kv_events.add_events([common_event]) + kv_events.add_events([common_event]) + kv_events.add_events([common_event]) + + # Only 1 worker reports uncommon_event + kv_events.add_events([uncommon_event]) + + mock_connector._kv_cache_events = kv_events + + # Take events + events = list(mock_connector.take_events()) + + # Only the common event should be yielded + assert len(events) == 1 + assert events[0] == common_event + + def test_multiple_take_events_calls(self, mock_connector): + """Test calling take_events multiple times.""" + # First call with events + kv_events1 = LMCacheKVEvents(num_workers=1) + event1 = BlockStored( + block_hashes=["hash1"], + parent_block_hash=None, + token_ids=[1], + block_size=16, + lora_id=None, + medium="GPU", + ) + kv_events1.add_events([event1]) + mock_connector._kv_cache_events = kv_events1 + + events1 = list(mock_connector.take_events()) + assert len(events1) == 1 + assert events1[0] == event1 + assert mock_connector._kv_cache_events is None + + # Second call with no events + events2 = list(mock_connector.take_events()) + assert events2 == [] + + # Third call after adding new events + kv_events2 = LMCacheKVEvents(num_workers=1) + event2 = BlockStored( + block_hashes=["hash2"], + parent_block_hash=None, + token_ids=[2], + block_size=16, + lora_id=None, + medium="GPU", + ) + kv_events2.add_events([event2]) + mock_connector._kv_cache_events = kv_events2 + + events3 = list(mock_connector.take_events()) + assert len(events3) == 1 + assert events3[0] == event2 + + def test_yields_empty_after_aggregation_removes_all(self, mock_connector): + """Test that nothing is yielded if aggregation removes all events.""" + # Set up events from 2 workers with no common events + kv_events = LMCacheKVEvents(num_workers=2) + event1 = BlockStored( + block_hashes=["hash1"], + parent_block_hash=None, + token_ids=[1], + block_size=16, + lora_id=None, + medium="GPU", + ) + event2 = BlockStored( + block_hashes=["hash2"], + parent_block_hash=None, + token_ids=[2], + block_size=16, + lora_id=None, + 
medium="GPU", + ) + + # Worker 1 reports event1 + kv_events.add_events([event1]) + # Worker 2 reports event2 + kv_events.add_events([event2]) + + mock_connector._kv_cache_events = kv_events + + # Take events + events = list(mock_connector.take_events()) + + # No common events, so nothing should be yielded + assert events == [] + assert mock_connector._kv_cache_events is None + + +class TestIntegrationScenarios: + """Test integration scenarios.""" + + def test_full_workflow(self, mock_connector, mock_lmcache_engine_event): + """Test a complete workflow from getting events to taking them.""" + # Step 1: Get events from lmcache engine + mock_connector._lmcache_engine.get_kv_events.return_value = [ + mock_lmcache_engine_event + ] + kv_events = mock_connector.get_kv_connector_kv_cache_events() + + assert kv_events is not None + assert len(kv_events.get_all_events()) == 1 + + # Step 2: Update connector output (simulate receiving from worker) + output1 = KVConnectorOutput(kv_cache_events=kv_events) + mock_connector.update_connector_output(output1) + + assert mock_connector._kv_cache_events is not None + + # Step 3: Take events + taken_events = list(mock_connector.take_events()) + + assert len(taken_events) == 1 + assert mock_connector._kv_cache_events is None + + def test_multiple_workers_workflow(self, mock_connector): + """Test workflow with multiple workers.""" + + class MockEvent: + def __init__(self, hash_val): + self.block_hashes = [hash_val] + self.parent_block_hash = None + self.token_ids = [1] + self.lora_id = None + self.block_size = 16 + self.medium = "GPU" + + # Worker 1 + mock_connector._lmcache_engine.get_kv_events.return_value = [ + MockEvent("hash_common"), + MockEvent("hash_worker1"), + ] + kv_events1 = mock_connector.get_kv_connector_kv_cache_events() + output1 = KVConnectorOutput(kv_cache_events=kv_events1) + mock_connector.update_connector_output(output1) + + # Worker 2 + mock_connector._lmcache_engine.get_kv_events.return_value = [ + MockEvent("hash_common"), + MockEvent("hash_worker2"), + ] + kv_events2 = mock_connector.get_kv_connector_kv_cache_events() + output2 = KVConnectorOutput(kv_cache_events=kv_events2) + mock_connector.update_connector_output(output2) + + # Take events (should only get common events) + taken_events = list(mock_connector.take_events()) + + # With aggregation, only events reported by both workers should be present + # In this case, hash_common was reported by both + event_hashes = [e.block_hashes[0] for e in taken_events] + assert "hash_common" in event_hashes + + def test_empty_workflow(self, mock_connector): + """Test workflow when there are no events at any stage.""" + # Get events returns None + mock_connector._lmcache_engine.get_kv_events.return_value = None + kv_events = mock_connector.get_kv_connector_kv_cache_events() + + assert kv_events is None + + # Update with None + output = KVConnectorOutput(kv_cache_events=None) + mock_connector.update_connector_output(output) + + # Take events + taken_events = list(mock_connector.take_events()) + + assert taken_events == [] + assert mock_connector._kv_cache_events is None + + def test_repeated_cycles(self, mock_connector): + """Test multiple cycles of the complete workflow.""" + + class MockEvent: + def __init__(self, cycle_num): + self.block_hashes = [f"hash_cycle_{cycle_num}"] + self.parent_block_hash = None + self.token_ids = [cycle_num] + self.lora_id = None + self.block_size = 16 + self.medium = "GPU" + + for cycle in range(3): + # Get events + 
mock_connector._lmcache_engine.get_kv_events.return_value = [ + MockEvent(cycle) + ] + kv_events = mock_connector.get_kv_connector_kv_cache_events() + + # Update + output = KVConnectorOutput(kv_cache_events=kv_events) + mock_connector.update_connector_output(output) + + # Take + taken_events = list(mock_connector.take_events()) + + # Verify + assert len(taken_events) == 1 + assert taken_events[0].block_hashes[0] == f"hash_cycle_{cycle}" + assert mock_connector._kv_cache_events is None + + def test_lmcache_kv_events_aggregation(self): + """ + Test LMCacheKVEvents aggregation across TP ranks using + KVOutputAggregator (used by MultiprocExecutor). + """ + from vllm.distributed.kv_transfer.kv_connector.utils import KVOutputAggregator + from vllm.v1.outputs import ModelRunnerOutput + + # Create KVOutputAggregator for 3 workers (simulating TP=3) + aggregator = KVOutputAggregator(expected_finished_count=3) + + # Define common and unique events + common_event = BlockStored( + block_hashes=["hash_common"], + parent_block_hash="parent_common", + token_ids=[1, 2, 3], + block_size=16, + lora_id=None, + medium="GPU", + ) + + worker1_unique_event = BlockStored( + block_hashes=["hash_worker1"], + parent_block_hash="parent_w1", + token_ids=[4, 5], + block_size=16, + lora_id=None, + medium="GPU", + ) + + worker2_unique_event = BlockStored( + block_hashes=["hash_worker2"], + parent_block_hash="parent_w2", + token_ids=[6, 7], + block_size=16, + lora_id=None, + medium="GPU", + ) + + worker3_unique_event = BlockStored( + block_hashes=["hash_worker3"], + parent_block_hash="parent_w3", + token_ids=[8, 9], + block_size=16, + lora_id=None, + medium="GPU", + ) + + # Create events for each worker + # Worker 0: reports common event and its unique event + worker0_events = LMCacheKVEvents(num_workers=1) + worker0_events.add_events([common_event, worker1_unique_event]) + + # Worker 1: reports common event and its unique event + worker1_events = LMCacheKVEvents(num_workers=1) + worker1_events.add_events([common_event, worker2_unique_event]) + + # Worker 2: reports common event and its unique event + worker2_events = LMCacheKVEvents(num_workers=1) + worker2_events.add_events([common_event, worker3_unique_event]) + + # Create ModelRunnerOutput instances for each worker + worker_outputs = [] + for i, worker_events in enumerate( + [worker0_events, worker1_events, worker2_events] + ): + output = ModelRunnerOutput( + req_ids=[f"req_{i}"], + req_id_to_index={f"req_{i}": 0}, + sampled_token_ids=[[123]], # dummy token + logprobs=None, + prompt_logprobs_dict={}, + pooler_output=[None], + kv_connector_output=KVConnectorOutput( + finished_sending=set([f"req_{i}_send"]) + if i < 2 + else None, # Workers 0,1 finished sending + finished_recving=set([f"req_{i}_recv"]) + if i > 0 + else None, # Workers 1,2 finished receiving + kv_cache_events=worker_events, + ), + ) + worker_outputs.append(output) + + # Use the real aggregation mechanism (like MultiprocExecutor.execute_model) + aggregated_output = aggregator.aggregate(worker_outputs, output_rank=0) + kv_cache_events = aggregated_output.kv_connector_output.kv_cache_events + + assert isinstance(kv_cache_events, LMCacheKVEvents) + + # After aggregation, events should be combined from all workers + # The aggregator doesn't automatically aggregate events, so we need to call + # aggregate() to get only common events + kv_cache_events.aggregate() + aggregated_events = kv_cache_events.get_all_events() + + # Only the common event should remain after aggregation + # because it's the only event 
reported by all 3 workers + assert len(aggregated_events) == 1 + assert aggregated_events[0] == common_event + + # Verify the common event properties + assert aggregated_events[0].block_hashes == ["hash_common"] + assert aggregated_events[0].parent_block_hash == "parent_common" + assert aggregated_events[0].token_ids == [1, 2, 3] diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py index ffa7d884d2762..9b6d52e7c294d 100644 --- a/tests/v1/kv_connector/unit/test_multi_connector.py +++ b/tests/v1/kv_connector/unit/test_multi_connector.py @@ -77,9 +77,9 @@ def _compare_directories(dir1: Path, dir2: Path) -> bool: "https://github.com/ROCm/pytorch/issues/2822" ), ) -def test_multi_shared_storage_connector_consistency(): +def test_multi_example_connector_consistency(): """ - Tests that MultiConnector with two SharedStorageConnectors saves + Tests that MultiConnector with two ExampleConnectors saves identical KV cache data to separate storage locations. """ storage_1_path = Path("storage_1/") @@ -89,14 +89,14 @@ def test_multi_shared_storage_connector_consistency(): storage_1_path.mkdir() storage_2_path.mkdir() - # Configure MultiConnector with two SharedStorageConnectors + # Configure MultiConnector with two ExampleConnectors kv_transfer_config = KVTransferConfig( kv_connector="MultiConnector", kv_role="kv_both", kv_connector_extra_config={ "connectors": [ { - "kv_connector": "TestSharedStorageConnector", + "kv_connector": "TestExampleConnector", "kv_role": "kv_both", "kv_connector_extra_config": { "shared_storage_path": str(storage_1_path), @@ -105,7 +105,7 @@ def test_multi_shared_storage_connector_consistency(): "kv_connector_module_path": "tests.v1.kv_connector.unit.utils", }, { - "kv_connector": "TestSharedStorageConnector", + "kv_connector": "TestExampleConnector", "kv_role": "kv_both", "kv_connector_extra_config": { "shared_storage_path": str(storage_2_path), @@ -427,7 +427,7 @@ class TestMultiConnectorStats: def test_build_kv_connector_stats_skips_connectors_without_custom_stats(self): """Test that connectors without custom stats (return None) are skipped.""" - # SharedStorageConnector doesn't override build_kv_connector_stats, + # ExampleConnector doesn't override build_kv_connector_stats, # so it returns None and should be skipped serialized_data = { "NixlConnector": { @@ -440,7 +440,7 @@ class TestMultiConnectorStats: "num_failed_notifications": [], } }, - "SharedStorageConnector": {"data": {"some_field": [1, 2, 3]}}, + "ExampleConnector": {"data": {"some_field": [1, 2, 3]}}, } stats = MultiConnector.build_kv_connector_stats(data=serialized_data) @@ -451,8 +451,8 @@ class TestMultiConnectorStats: assert len(stats.data) == 1 assert "NixlConnector" in stats.data assert isinstance(stats.data["NixlConnector"], NixlKVConnectorStats) - # SharedStorageConnector should be skipped (returns None) - assert "SharedStorageConnector" not in stats.data + # ExampleConnector should be skipped (returns None) + assert "ExampleConnector" not in stats.data def test_build_kv_connector_stats_handles_malformed_data(self): """Test that malformed data raises appropriate errors.""" @@ -527,13 +527,13 @@ class TestMultiConnectorStats: ) stats2 = MultiKVConnectorStats( - data={"SharedStorageConnector": KVConnectorStats(data={"field": [1, 2]})} + data={"ExampleConnector": KVConnectorStats(data={"field": [1, 2]})} ) result = stats1.aggregate(stats2) assert "NixlConnector" in result.data - assert "SharedStorageConnector" in result.data + assert 
"ExampleConnector" in result.data def test_reduce(self): """Test that reduce() correctly reduces all nested connector stats.""" diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 58f1a7282352b..5cdb1f84b30d4 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -24,8 +24,8 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorMetadata, KVConnectorRole, ) -from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import ( # noqa - SharedStorageConnector, +from vllm.distributed.kv_transfer.kv_connector.v1.example_connector import ( # noqa + ExampleConnector, ) from vllm.utils.hashing import sha256 from vllm.v1.core.kv_cache_manager import KVCacheBlocks @@ -264,10 +264,10 @@ def create_model_runner_output( ) -class TestSharedStorageConnector(SharedStorageConnector): +class TestExampleConnector(ExampleConnector): def __init__(self, config: VllmConfig, role, kv_cache_config): self.name = config.kv_transfer_config.kv_connector_extra_config["name"] - self._connector = SharedStorageConnector(config, role) + self._connector = ExampleConnector(config, role) self.call_record: dict[str, int] = defaultdict(int) # Use a unique temp file per connector self._event_file = ( @@ -394,7 +394,7 @@ class MockKVConnector(KVConnectorBase_V1): KVConnectorFactory.register_connector( - "TestSharedStorageConnector", __name__, TestSharedStorageConnector.__name__ + "TestExampleConnector", __name__, TestExampleConnector.__name__ ) KVConnectorFactory.register_connector( diff --git a/tests/v1/metrics/test_stats.py b/tests/v1/metrics/test_stats.py index 48067def8357e..7d902bbc6fc24 100644 --- a/tests/v1/metrics/test_stats.py +++ b/tests/v1/metrics/test_stats.py @@ -1,8 +1,109 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from vllm.v1.metrics.stats import IterationStats +from vllm.v1.engine import FinishReason +from vllm.v1.metrics.stats import IterationStats, RequestStateStats def test_iteration_stats_repr(): iteration_stats = IterationStats() assert repr(iteration_stats).startswith("IterationStats(") + + +def test_prefill_kv_computed_with_cache(): + """Test that prefill KV compute correctly excludes cached tokens.""" + iteration_stats = IterationStats() + req_stats = RequestStateStats(arrival_time=0.0) + req_stats.scheduled_ts = 0.1 + req_stats.first_token_ts = 0.5 + req_stats.last_token_ts = 5.0 + req_stats.num_generation_tokens = 50 + + # Case 1: With prefix cache (1200 tokens cached) + iteration_stats.update_from_finished_request( + finish_reason=FinishReason.STOP, + num_prompt_tokens=10000, + max_tokens_param=100, + req_stats=req_stats, + num_cached_tokens=1200, + ) + + finished_req = iteration_stats.finished_requests[0] + assert finished_req.num_prompt_tokens == 10000 + assert finished_req.num_cached_tokens == 1200 + + # Verify calculation: prefill KV = prompt tokens - cached tokens + prefill_kv_computed = finished_req.num_prompt_tokens - max( + finished_req.num_cached_tokens, 0 + ) + assert prefill_kv_computed == 8800 # 10000 - 1200 + + +def test_prefill_kv_computed_no_cache(): + """Test prefill KV compute without prefix caching.""" + iteration_stats = IterationStats() + req_stats = RequestStateStats(arrival_time=0.0) + req_stats.scheduled_ts = 0.1 + req_stats.first_token_ts = 0.5 + req_stats.last_token_ts = 2.0 + req_stats.num_generation_tokens = 10 + + # Case 2: No prefix cache + iteration_stats.update_from_finished_request( + 
finish_reason=FinishReason.STOP, + num_prompt_tokens=2000, + max_tokens_param=100, + req_stats=req_stats, + num_cached_tokens=0, + ) + + finished_req = iteration_stats.finished_requests[0] + assert finished_req.num_prompt_tokens == 2000 + assert finished_req.num_cached_tokens == 0 + + # Verify calculation: prefill KV = full prompt when no cache + prefill_kv_computed = finished_req.num_prompt_tokens - max( + finished_req.num_cached_tokens, 0 + ) + assert prefill_kv_computed == 2000 + + +def test_prefill_kv_computed_edge_cases(): + """Test edge cases for prefill KV compute calculation.""" + iteration_stats = IterationStats() + req_stats = RequestStateStats(arrival_time=0.0) + req_stats.scheduled_ts = 0.1 + req_stats.first_token_ts = 0.5 + req_stats.last_token_ts = 1.0 + req_stats.num_generation_tokens = 1 + + # Case 3: Negative num_cached_tokens (shouldn't happen, but handle gracefully) + iteration_stats.update_from_finished_request( + finish_reason=FinishReason.STOP, + num_prompt_tokens=100, + max_tokens_param=10, + req_stats=req_stats, + num_cached_tokens=-1, + ) + + finished_req = iteration_stats.finished_requests[0] + # max() should handle negative values + prefill_kv_computed = finished_req.num_prompt_tokens - max( + finished_req.num_cached_tokens, 0 + ) + assert prefill_kv_computed == 100 # Should treat negative as 0 + + # Case 4: All tokens cached (shouldn't happen in practice) + iteration_stats2 = IterationStats() + iteration_stats2.update_from_finished_request( + finish_reason=FinishReason.STOP, + num_prompt_tokens=100, + max_tokens_param=10, + req_stats=req_stats, + num_cached_tokens=100, + ) + + finished_req2 = iteration_stats2.finished_requests[0] + prefill_kv_computed2 = finished_req2.num_prompt_tokens - max( + finished_req2.num_cached_tokens, 0 + ) + assert prefill_kv_computed2 == 0 # All cached, nothing computed diff --git a/tests/v1/spec_decode/test_max_len.py b/tests/v1/spec_decode/test_max_len.py index 81da8609aa6cf..15a6bd2659ea9 100644 --- a/tests/v1/spec_decode/test_max_len.py +++ b/tests/v1/spec_decode/test_max_len.py @@ -82,7 +82,7 @@ def test_eagle_max_len( len(o.prompt_token_ids) < 80 < len(o.prompt_token_ids) + len(o.outputs[0].token_ids) - < 200 + <= 200 ), ( "This test is only meaningful if the output " "is longer than the eagle max length" diff --git a/tests/v1/spec_decode/test_tree_attention.py b/tests/v1/spec_decode/test_tree_attention.py index a4ee53008ce82..0afeeb8914b87 100644 --- a/tests/v1/spec_decode/test_tree_attention.py +++ b/tests/v1/spec_decode/test_tree_attention.py @@ -88,8 +88,8 @@ def forward_attention( query_start_loc=query_start_loc, query_start_loc_cpu=query_start_loc.cpu(), seq_lens=seq_lens, - seq_lens_cpu=seq_lens.cpu(), - num_computed_tokens_cpu=context_lens.cpu(), + _seq_lens_cpu=seq_lens.cpu(), + _num_computed_tokens_cpu=context_lens.cpu(), num_reqs=batch_size, num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, diff --git a/tests/v1/structured_output/test_reasoning_structured_output.py b/tests/v1/structured_output/test_reasoning_structured_output.py index 70047a993c3f9..ba52af3ad604d 100644 --- a/tests/v1/structured_output/test_reasoning_structured_output.py +++ b/tests/v1/structured_output/test_reasoning_structured_output.py @@ -70,6 +70,7 @@ class TestReasoningStructuredOutput: request.use_structured_output = True request.prompt_token_ids = [1, 2, 3, 4, 5] request.all_token_ids = [1, 2, 3, 4, 5, 6, 7, 8] + request.num_computed_tokens = 5 return request def test_should_fill_bitmask_with_enable_in_reasoning( diff --git 
a/tests/v1/worker/test_gpu_profiler.py b/tests/v1/worker/test_gpu_profiler.py index f7255fae05a4e..933ea42f18cd5 100644 --- a/tests/v1/worker/test_gpu_profiler.py +++ b/tests/v1/worker/test_gpu_profiler.py @@ -2,8 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import pytest -import vllm.envs as envs -from vllm.profiler.gpu_profiler import WorkerProfiler +from vllm.config import ProfilerConfig +from vllm.profiler.wrapper import WorkerProfiler class ConcreteWorkerProfiler(WorkerProfiler): @@ -11,11 +11,11 @@ class ConcreteWorkerProfiler(WorkerProfiler): A basic implementation of a worker profiler for testing purposes. """ - def __init__(self): + def __init__(self, profiler_config: ProfilerConfig): self.start_call_count = 0 self.stop_call_count = 0 self.should_fail_start = False - super().__init__() + super().__init__(profiler_config) def _start(self) -> None: if self.should_fail_start: @@ -26,17 +26,19 @@ class ConcreteWorkerProfiler(WorkerProfiler): self.stop_call_count += 1 -@pytest.fixture(autouse=True) -def reset_mocks(): - """Fixture to reset mocks and env variables before each test.""" - envs.VLLM_PROFILER_DELAY_ITERS = 0 - envs.VLLM_PROFILER_MAX_ITERS = 0 +@pytest.fixture +def default_profiler_config(): + return ProfilerConfig( + profiler="torch", + torch_profiler_dir="/tmp/mock", + delay_iterations=0, + max_iterations=0, + ) -def test_immediate_start_stop(): +def test_immediate_start_stop(default_profiler_config): """Test standard start without delay.""" - profiler = ConcreteWorkerProfiler() - + profiler = ConcreteWorkerProfiler(default_profiler_config) profiler.start() assert profiler._running is True assert profiler._active is True @@ -48,10 +50,10 @@ def test_immediate_start_stop(): assert profiler.stop_call_count == 1 -def test_delayed_start(): +def test_delayed_start(default_profiler_config): """Test that profiler waits for N steps before actually starting.""" - envs.VLLM_PROFILER_DELAY_ITERS = 2 - profiler = ConcreteWorkerProfiler() + default_profiler_config.delay_iterations = 2 + profiler = ConcreteWorkerProfiler(default_profiler_config) # User requests start profiler.start() @@ -71,10 +73,10 @@ def test_delayed_start(): assert profiler.start_call_count == 1 -def test_max_iterations(): +def test_max_iterations(default_profiler_config): """Test that profiler stops automatically after max iterations.""" - envs.VLLM_PROFILER_MAX_ITERS = 2 - profiler = ConcreteWorkerProfiler() + default_profiler_config.max_iterations = 2 + profiler = ConcreteWorkerProfiler(default_profiler_config) profiler.start() assert profiler._running is True @@ -95,12 +97,11 @@ def test_max_iterations(): assert profiler.stop_call_count == 1 -def test_delayed_start_and_max_iters(): +def test_delayed_start_and_max_iters(default_profiler_config): """Test combined delayed start and max iterations.""" - envs.VLLM_PROFILER_DELAY_ITERS = 2 - envs.VLLM_PROFILER_MAX_ITERS = 2 - profiler = ConcreteWorkerProfiler() - + default_profiler_config.delay_iterations = 2 + default_profiler_config.max_iterations = 2 + profiler = ConcreteWorkerProfiler(default_profiler_config) profiler.start() # Step 1 @@ -127,9 +128,9 @@ def test_delayed_start_and_max_iters(): assert profiler.stop_call_count == 1 -def test_idempotency(): +def test_idempotency(default_profiler_config): """Test that calling start/stop multiple times doesn't break logic.""" - profiler = ConcreteWorkerProfiler() + profiler = ConcreteWorkerProfiler(default_profiler_config) # Double Start profiler.start() @@ -142,10 +143,10 @@ def 
test_idempotency(): assert profiler.stop_call_count == 1 # Should only stop once -def test_step_inactive(): +def test_step_inactive(default_profiler_config): """Test that stepping while inactive does nothing.""" - envs.VLLM_PROFILER_DELAY_ITERS = 2 - profiler = ConcreteWorkerProfiler() + default_profiler_config.delay_iterations = 2 + profiler = ConcreteWorkerProfiler(default_profiler_config) # Not started yet profiler.step() @@ -155,9 +156,9 @@ def test_step_inactive(): assert profiler.start_call_count == 0 -def test_start_failure(): +def test_start_failure(default_profiler_config): """Test behavior when the underlying _start method raises exception.""" - profiler = ConcreteWorkerProfiler() + profiler = ConcreteWorkerProfiler(default_profiler_config) profiler.should_fail_start = True profiler.start() @@ -168,9 +169,9 @@ def test_start_failure(): assert profiler.start_call_count == 0 # Logic failed inside start -def test_shutdown(): +def test_shutdown(default_profiler_config): """Test that shutdown calls stop only if running.""" - profiler = ConcreteWorkerProfiler() + profiler = ConcreteWorkerProfiler(default_profiler_config) # Case 1: Not running profiler.shutdown() @@ -182,10 +183,10 @@ def test_shutdown(): assert profiler.stop_call_count == 1 -def test_mixed_delay_and_stop(): +def test_mixed_delay_and_stop(default_profiler_config): """Test manual stop during the delay period.""" - envs.VLLM_PROFILER_DELAY_ITERS = 5 - profiler = ConcreteWorkerProfiler() + default_profiler_config.delay_iterations = 5 + profiler = ConcreteWorkerProfiler(default_profiler_config) profiler.start() profiler.step() diff --git a/tools/ep_kernels/README.md b/tools/ep_kernels/README.md index 85e9d2a4f8129..ab0e358802bf8 100644 --- a/tools/ep_kernels/README.md +++ b/tools/ep_kernels/README.md @@ -7,7 +7,7 @@ Here we break down the requirements in 2 steps: 1. Build and install the Python libraries (both [pplx-kernels](https://github.com/ppl-ai/pplx-kernels) and [DeepEP](https://github.com/deepseek-ai/DeepEP)), including necessary dependencies like NVSHMEM. This step does not require any privileged access. Any user can do this. 2. Configure NVIDIA driver to enable IBGDA. This step requires root access, and must be done on the host machine. -2 is necessary for multi-node deployment. +Step 2 is necessary for multi-node deployment. All scripts accept a positional argument as workspace path for staging the build, defaulting to `$(pwd)/ep_kernels_workspace`. @@ -23,6 +23,6 @@ TORCH_CUDA_ARCH_LIST="10.0" bash install_python_libraries.sh Additional step for multi-node deployment: ```bash -sudo bash configure_system_drivers.sh +sudo bash configure_system_drivers.sh # update-initramfs can take several minutes sudo reboot # Reboot is required to load the new driver ``` diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index 35920d826578e..010817e79a936 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -9,6 +9,8 @@ import vllm.envs as envs from vllm.platforms import current_platform from vllm.utils.torch_utils import direct_register_custom_op, is_torch_equal_or_newer +_FP8_DTYPE = current_platform.fp8_dtype() + def is_aiter_found() -> bool: from importlib.util import find_spec @@ -22,6 +24,15 @@ def is_aiter_found() -> bool: # we keep this global outside to not cause torch compile breaks. IS_AITER_FOUND = is_aiter_found() +# Can't use dtypes.fp8 directly inside an op +# because it returns wrong result on gfx942. +# This is a workaround to get the correct FP8 dtype. 
+# This might be because get_gfx() is wrapped as a custom op. +if IS_AITER_FOUND: + from aiter import dtypes + + AITER_FP8_DTYPE = dtypes.fp8 + def if_aiter_supported(func: Callable) -> Callable: """Decorator that only executes the function if @@ -43,36 +54,6 @@ def if_aiter_supported(func: Callable) -> Callable: return wrapper -def _rocm_aiter_group_fp8_quant_impl( - x: torch.Tensor, - group_size: int, -) -> tuple[torch.Tensor, torch.Tensor]: - assert x.shape[-1] % group_size == 0, "Input shape must be divisible by group size" - from aiter import QuantType, dtypes, get_hip_quant - - aiter_per1x128_quant = get_hip_quant(QuantType.per_1x128) - return aiter_per1x128_quant(x.contiguous(), quant_dtype=dtypes.fp8) - - -def _rocm_aiter_group_fp8_quant_fake( - x: torch.Tensor, - group_size: int, -) -> tuple[torch.Tensor, torch.Tensor]: - from aiter import dtypes - - M, N = x.shape - x_fp8 = torch.empty((M, N), dtype=dtypes.fp8, device=x.device) - out_bs = torch.empty( - ( - M, - (N + group_size - 1) // group_size, - ), - dtype=torch.float32, - device=x.device, - ) - return x_fp8, out_bs - - def _rocm_aiter_fused_moe_impl( hidden_states: torch.Tensor, w1: torch.Tensor, @@ -467,6 +448,195 @@ def _rocm_aiter_rmsnorm2d_fwd_with_add_fake( return torch.empty_like(x), torch.empty_like(residual) +def _rocm_aiter_per_tensor_quant_impl( + x: torch.Tensor, + quant_dtype: torch.dtype, + scale: torch.Tensor | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + from aiter.ops.quant import per_tensor_quant_hip + + return per_tensor_quant_hip(x, scale, quant_dtype) + + +def _rocm_aiter_per_tensor_quant_fake( + x: torch.Tensor, + quant_dtype: torch.dtype, + scale: torch.Tensor | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + return torch.empty_like(x, dtype=quant_dtype), torch.empty( + 1, dtype=torch.float32, device=x.device + ) + + +def _rocm_aiter_per_token_quant_impl( + x: torch.Tensor, quant_dtype: torch.dtype, scale: torch.Tensor | None = None +) -> tuple[torch.Tensor, torch.Tensor]: + from aiter.ops.quant import dynamic_per_token_scaled_quant + + assert quant_dtype in [torch.int8, _FP8_DTYPE] + + out_shape = x.shape + out = torch.empty(x.shape, dtype=_FP8_DTYPE, device=x.device) + if scale is None: + scale = torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device) + dynamic_per_token_scaled_quant( + out, + x, + scale, + scale_ub=None, + shuffle_scale=False, + num_rows=None, + num_rows_factor=1, + ) + return out, scale + + +def _rocm_aiter_per_token_quant_fake( + x: torch.Tensor, quant_dtype: torch.dtype, scale: torch.Tensor | None = None +) -> tuple[torch.Tensor, torch.Tensor]: + out_shape = x.shape + return ( + torch.empty(x.shape, dtype=_FP8_DTYPE, device=x.device), + torch.empty((*out_shape[:-1], 1), dtype=torch.float32, device=x.device), + ) + + +def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_impl( + x: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + variance_epsilon: float, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + from aiter.ops.triton.fused_fp8_quant import fused_rms_fp8_group_quant + + (x_quant, x_quant_scales), _, _, res = fused_rms_fp8_group_quant( + x, + weight, + variance_epsilon, + None, + None, + None, + group_size=group_size, + dtype_quant=AITER_FP8_DTYPE, + res1=residual, + ) + return (x_quant, x_quant_scales, res) + + +def _rocm_aiter_rmsnorm_with_add_fp8_group_quant_fake( + x: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + variance_epsilon: float, + group_size: int, +) -> 
tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + M, N = x.shape + scale_shape = (M, (N + group_size - 1) // group_size) + return ( + torch.empty_like(x, dtype=AITER_FP8_DTYPE, device=x.device), + torch.empty(scale_shape, dtype=torch.float32, device=x.device), + torch.empty_like(residual, device=residual.device), + ) + + +def _rocm_aiter_rmsnorm_fp8_group_quant_impl( + x: torch.Tensor, + weight: torch.Tensor, + variance_epsilon: float, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + from aiter.ops.triton.fused_fp8_quant import fused_rms_fp8_group_quant + + (x_quant, x_quant_scales), _, _, res = fused_rms_fp8_group_quant( + x, + weight, + variance_epsilon, + None, + None, + None, + group_size=group_size, + dtype_quant=AITER_FP8_DTYPE, + res1=None, + ) + return (x_quant, x_quant_scales) + + +def _rocm_aiter_rmsnorm_fp8_group_quant_fake( + x: torch.Tensor, + weight: torch.Tensor, + variance_epsilon: float, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + M, N = x.shape + scale_shape = (M, (N + group_size - 1) // group_size) + return ( + torch.empty_like(x, dtype=AITER_FP8_DTYPE, device=x.device), + torch.empty(scale_shape, dtype=torch.float32, device=x.device), + ) + + +def _rocm_aiter_group_fp8_quant_impl( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + assert x.shape[-1] % group_size == 0, "Input shape must be divisible by group size" + from aiter import QuantType, get_hip_quant + + aiter_per1x128_quant = get_hip_quant(QuantType.per_1x128) + return aiter_per1x128_quant(x.contiguous(), quant_dtype=AITER_FP8_DTYPE) + + +def _rocm_aiter_group_fp8_quant_fake( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + M, N = x.shape + x_fp8 = torch.empty((M, N), dtype=AITER_FP8_DTYPE, device=x.device) + out_bs = torch.empty( + ( + M, + (N + group_size - 1) // group_size, + ), + dtype=torch.float32, + device=x.device, + ) + return x_fp8, out_bs + + +def _rocm_aiter_act_mul_and_fp8_group_quant_impl( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + from aiter.ops.triton.activation import act_mul_and_fp8_group_quant + + return act_mul_and_fp8_group_quant( + x, + activation="silu", + group_size=group_size, + dtype_quant=AITER_FP8_DTYPE, + ) + + +def _rocm_aiter_act_mul_and_fp8_group_quant_fake( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + M, N = x.shape + assert N % 2 == 0 + N_half = N // 2 + x_fp8 = torch.empty((M, N_half), dtype=AITER_FP8_DTYPE, device=x.device) + out_bs = torch.empty( + ( + M, + (N_half + group_size - 1) // group_size, + ), + dtype=torch.float32, + device=x.device, + ) + return x_fp8, out_bs + + # Global flag to ensure ops are registered only once _OPS_REGISTERED = False @@ -502,7 +672,7 @@ class rocm_aiter_ops: @if_aiter_supported def is_linear_fp8_enaled(cls) -> bool: """ "Verifies device specs and availability of env variable.""" - return cls.is_linear_enabled() and current_platform.is_fp8_fnuz() + return cls.is_linear_enabled() @classmethod @if_aiter_supported @@ -577,14 +747,6 @@ class rocm_aiter_ops: ) # register all the custom ops here - direct_register_custom_op( - op_name="rocm_aiter_group_fp8_quant", - op_func=_rocm_aiter_group_fp8_quant_impl, - mutates_args=[], - fake_impl=_rocm_aiter_group_fp8_quant_fake, - dispatch_key=current_platform.dispatch_key, - ) - direct_register_custom_op( op_name="rocm_aiter_asm_moe_tkw1", op_func=_rocm_aiter_asm_moe_tkw1_impl, @@ -644,27 +806,62 @@ class rocm_aiter_ops: 
direct_register_custom_op( op_name="rocm_aiter_gemm_a8w8_blockscale", op_func=_rocm_aiter_gemm_a8w8_blockscale_impl, - mutates_args=[], fake_impl=_rocm_aiter_gemm_a8w8_blockscale_fake, - dispatch_key=current_platform.dispatch_key, ) direct_register_custom_op( op_name="rocm_aiter_rms_norm", op_func=_rocm_aiter_rms_norm_impl, - mutates_args=[], fake_impl=_rocm_aiter_rms_norm_fake, - dispatch_key=current_platform.dispatch_key, ) direct_register_custom_op( op_name="rocm_aiter_rmsnorm2d_fwd_with_add", op_func=_rocm_aiter_rmsnorm2d_fwd_with_add_impl, - mutates_args=[], fake_impl=_rocm_aiter_rmsnorm2d_fwd_with_add_fake, dispatch_key=current_platform.dispatch_key, ) + direct_register_custom_op( + op_name="rocm_aiter_rmsnorm_fp8_group_quant", + op_func=_rocm_aiter_rmsnorm_fp8_group_quant_impl, + fake_impl=_rocm_aiter_rmsnorm_fp8_group_quant_fake, + ) + + direct_register_custom_op( + op_name="rocm_aiter_rmsnorm_with_add_fp8_group_quant", + op_func=_rocm_aiter_rmsnorm_with_add_fp8_group_quant_impl, + fake_impl=_rocm_aiter_rmsnorm_with_add_fp8_group_quant_fake, + ) + + direct_register_custom_op( + op_name="rocm_aiter_act_mul_and_fp8_group_quant", + op_func=_rocm_aiter_act_mul_and_fp8_group_quant_impl, + fake_impl=_rocm_aiter_act_mul_and_fp8_group_quant_fake, + ) + + direct_register_custom_op( + op_name="rocm_aiter_group_fp8_quant", + op_func=_rocm_aiter_group_fp8_quant_impl, + fake_impl=_rocm_aiter_group_fp8_quant_fake, + ) + + direct_register_custom_op( + op_name="rocm_aiter_per_tensor_quant", + op_func=_rocm_aiter_per_tensor_quant_impl, + mutates_args=[], + fake_impl=_rocm_aiter_per_tensor_quant_fake, + dispatch_key=current_platform.dispatch_key, + ) + + direct_register_custom_op( + op_name="rocm_aiter_per_token_quant", + op_func=_rocm_aiter_per_token_quant_impl, + mutates_args=["scale"], + fake_impl=_rocm_aiter_per_token_quant_fake, + dispatch_key=current_platform.dispatch_key, + ) + _OPS_REGISTERED = True @staticmethod @@ -859,6 +1056,22 @@ class rocm_aiter_ops: kv_scale=kv_scale, ) + @staticmethod + def per_tensor_quant( + x: torch.Tensor, + quant_dtype: torch.dtype, + scale: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + return torch.ops.vllm.rocm_aiter_per_tensor_quant(x, quant_dtype, scale) + + @staticmethod + def per_token_quant( + x: torch.Tensor, + quant_dtype: torch.dtype, + scale: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + return torch.ops.vllm.rocm_aiter_per_token_quant(x, quant_dtype, scale) + @staticmethod def triton_fp4_gemm_dynamic_qaunt( x: torch.Tensor, diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 77d5453291e3c..6d862c5812560 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -695,6 +695,10 @@ if hasattr(torch.ops._C, "gptq_marlin_24_gemm"): def cutlass_encode_and_reorder_int4b_fake(b: torch.Tensor) -> torch.Tensor: return torch.empty_like(b, memory_format=torch.contiguous_format) + @register_fake("_C::cutlass_encode_and_reorder_int4b_grouped") + def cutlass_encode_and_reorder_int4b_grouped_fake(b: torch.Tensor) -> torch.Tensor: + return torch.empty_like(b, memory_format=torch.contiguous_format) + if hasattr(torch.ops._C, "allspark_w8a16_gemm"): @@ -1058,6 +1062,7 @@ def get_cutlass_moe_mm_problem_sizes( n: int, k: int, blockscale_offsets: torch.Tensor | None = None, + force_swap_ab: bool | None = None, ): """ Compute only the per-expert problem sizes needed by the two grouped matrix @@ -1067,9 +1072,20 @@ def get_cutlass_moe_mm_problem_sizes( - problem_sizes1, problem_sizes2: M×N×K sizes of each 
expert's multiplication for the two grouped MMs used in the fused MoE operation. + Optional: + - force_swap_ab: If set to True or False, explicitly enable or disable the + A/B input swap optimization. If None (default), the swap + is selected automatically based on tensor sizes. """ return torch.ops._C.get_cutlass_moe_mm_problem_sizes( - topk_ids, problem_sizes1, problem_sizes2, num_experts, n, k, blockscale_offsets + topk_ids, + problem_sizes1, + problem_sizes2, + num_experts, + n, + k, + blockscale_offsets, + force_swap_ab, ) @@ -1457,6 +1473,78 @@ def cutlass_encode_and_reorder_int4b(b: torch.Tensor) -> torch.Tensor: return torch.ops._C.cutlass_encode_and_reorder_int4b(b) +def cutlass_w4a8_moe_mm( + out_tensors: torch.Tensor, + a_tensors: torch.Tensor, + b_tensors: torch.Tensor, + a_scales: torch.Tensor, + b_scales: torch.Tensor, + b_group_scales: torch.Tensor, + b_group_size: int, + expert_offsets: torch.Tensor, + problem_sizes: torch.Tensor, + a_strides: torch.Tensor, + b_strides: torch.Tensor, + c_strides: torch.Tensor, + group_scale_strides: torch.Tensor, + maybe_schedule: str | None = None, +): + """ + Executes the CUTLASS-based fused-MoE grouped matrix multiplication for the + W4A8 quantization scheme. Uses group-wise quantization (INT4 -> FP8) + and both per-channel + per-token scaling in the epilogue. + + Args: + out_tensors: + Output buffer for all experts (updated in-place). + a_tensors: + FP8 (E4M3FN) activations for all experts. + b_tensors: + INT4-packed weight matrix for all experts, packed to INT32 + a_scales: + Per-token FP8 activation scales, applied in the epilogue. + b_scales: + Per-channel FP8 weight scales for each expert, applied in the epilogue. + b_group_scales: + FP8 scale values for group-wise INT4 weight blocks. + b_group_size: + Number of elements grouped under each entry of b_group_scales. + expert_offsets: + Cumulative token offsets + problem_sizes: + Per-expert (M, N, K) GEMM sizes used by the grouped GEMM launcher. + a/b/c/group_scale_strides: + Strides describing the memory layout of the input tensors. + maybe_schedule: + Optional override to choose a specific kernel or epilogue schedule. + + Returns: + out_tensors updated in-place with the dequantized INT4xFP8 grouped GEMM result. 
+ """ + return torch.ops._C.cutlass_w4a8_moe_mm( + out_tensors, + a_tensors, + b_tensors, + a_scales, + b_scales, + b_group_scales, + b_group_size, + expert_offsets, + problem_sizes, + a_strides, + b_strides, + c_strides, + group_scale_strides, + maybe_schedule, + ) + + +def cutlass_encode_and_reorder_int4b_grouped( + b_tensors: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + return torch.ops._C.cutlass_encode_and_reorder_int4b_grouped(b_tensors) + + if hasattr(torch.ops._C, "permute_cols"): @register_fake("_C::permute_cols") @@ -1638,7 +1726,7 @@ def scaled_fp8_quant( output, input, scale, scale_ub ) else: - scale = torch.empty((1, 1), device=input.device, dtype=torch.float32) + scale = torch.empty(1, device=input.device, dtype=torch.float32) torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale) else: assert scale.numel() == 1, f"{scale.shape}" @@ -1961,6 +2049,7 @@ def moe_lora_align_block_size( num_tokens_post_pad: torch.Tensor, adapter_enabled: torch.Tensor, lora_ids: torch.Tensor, + expert_map: torch.Tensor | None = None, ) -> None: torch.ops._moe_C.moe_lora_align_block_size( topk_ids, @@ -1975,6 +2064,7 @@ def moe_lora_align_block_size( num_tokens_post_pad, adapter_enabled, lora_ids, + expert_map, ) diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 03f4c40302eb8..025ede1eb0a4e 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -294,6 +294,12 @@ class AttentionImpl(ABC, Generic[T]): # Some features like decode context parallelism require the softmax lse. can_return_lse_for_decode: bool = False + # Whether the attention impl supports Prefill Context Parallelism. + supports_pcp: bool = False + # Whether the attention impl(or ops) supports MTP + # when cp_kv_cache_interleave_size > 1 + supports_mtp_with_cp_non_trivial_interleave_size: bool = False + # some attention backends might not always want to return lse # even if they can return lse (for efficiency reasons) need_to_return_lse_for_decode: bool = False diff --git a/vllm/attention/backends/registry.py b/vllm/attention/backends/registry.py index 125e4e3827747..eaa0fa1d5db39 100644 --- a/vllm/attention/backends/registry.py +++ b/vllm/attention/backends/registry.py @@ -252,35 +252,3 @@ def register_backend( return lambda x: x return decorator - - -# Backwards compatibility alias for plugins -class _BackendMeta(type): - """Metaclass to provide deprecation warnings when accessing _Backend.""" - - def __getattribute__(cls, name: str): - if name not in ("__class__", "__mro__", "__name__"): - logger.warning( - "_Backend has been renamed to AttentionBackendEnum. " - "Please update your code to use AttentionBackendEnum instead. " - "_Backend will be removed in a future release." - ) - return getattr(AttentionBackendEnum, name) - - def __getitem__(cls, name: str): - logger.warning( - "_Backend has been renamed to AttentionBackendEnum. " - "Please update your code to use AttentionBackendEnum instead. " - "_Backend will be removed in a future release." - ) - return AttentionBackendEnum[name] - - -class _Backend(metaclass=_BackendMeta): - """Deprecated: Use AttentionBackendEnum instead. - - This class is provided for backwards compatibility with plugins - and will be removed in a future release. 
- """ - - pass diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index 340b161ea1e15..c77fc0fad0038 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -25,6 +25,7 @@ from vllm.config.vllm import VllmConfig from vllm.forward_context import ForwardContext, get_forward_context from vllm.logger import init_logger from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +from vllm.model_executor.layers.batch_invariant import vllm_is_batch_invariant from vllm.model_executor.layers.linear import ( ColumnParallelLinear, UnquantizedLinearMethod, @@ -88,7 +89,10 @@ def maybe_get_vit_flash_attn_backend( if attn_backend == AttentionBackendEnum.ROCM_AITER_FA: from aiter import flash_attn_varlen_func else: - from vllm.attention.utils.fa_utils import flash_attn_varlen_func + try: + from vllm.attention.utils.fa_utils import flash_attn_varlen_func + except ImportError: + flash_attn_varlen_func = None else: flash_attn_varlen_func = None @@ -251,6 +255,24 @@ class Attention(nn.Module, AttentionLayerBase): else: self.attn_backend = attn_backend + # prefix caching + batch invariance is currently not supported for + # FLASHINFER and TRITON_MLA. + if ( + cache_config is not None + and cache_config.enable_prefix_caching + and vllm_is_batch_invariant() + and ( + self.attn_backend.get_name() == "FLASHINFER" + or self.attn_backend.get_name() == "TRITON_MLA" + ) + ): + logger.warning_once( + "Disabling prefix caching for FLASHINFER/TRITON_MLA " + "with batch invariance, as it is not yet supported.", + scope="local", + ) + cache_config.enable_prefix_caching = False + impl_cls = self.attn_backend.get_impl_cls() self.impl = impl_cls( num_heads, @@ -628,6 +650,23 @@ class MLAAttention(nn.Module, AttentionLayerBase): use_mla=True, use_sparse=use_sparse, ) + + if ( + cache_config is not None + and cache_config.enable_prefix_caching + and vllm_is_batch_invariant() + and ( + self.attn_backend.get_name() == "TRITON_MLA" + or self.attn_backend.get_name() == "FLASHINFER" + ) + ): + logger.warning_once( + "Disabling prefix caching for TRITON_MLA / FLASHINFER " + "with batch invariance, as it is not yet supported.", + scope="local", + ) + cache_config.enable_prefix_caching = False + impl_cls = cast(type[MLAAttentionImpl], self.attn_backend.get_impl_cls()) self.impl = impl_cls( num_heads=self.num_heads, diff --git a/vllm/attention/layers/cross_attention.py b/vllm/attention/layers/cross_attention.py index 068fd0a0eb7d0..cfd203bdd37b9 100644 --- a/vllm/attention/layers/cross_attention.py +++ b/vllm/attention/layers/cross_attention.py @@ -103,7 +103,7 @@ def create_cross_attention_backend( # needed here to know how many tokens to attend to from the cached # cross-attention KV cache. 
new_metadata.seq_lens = common_attn_metadata.encoder_seq_lens - new_metadata.seq_lens_cpu = torch.from_numpy( + new_metadata._seq_lens_cpu = torch.from_numpy( common_attn_metadata.encoder_seq_lens_cpu ) diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index f6aba271d2e96..bbf95ff009001 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -import inspect from functools import cache from typing import cast, get_args @@ -73,39 +72,18 @@ def _cached_get_attn_backend( ) -> type[AttentionBackend]: from vllm.platforms import current_platform - sig = inspect.signature(current_platform.get_attn_backend_cls) - if "use_v1" in sig.parameters: - logger.warning_once( - "use_v1 parameter for get_attn_backend_cls is deprecated and will " - "be removed in v0.13.0 or v1.0.0, whichever is soonest. Please " - "remove it from your plugin code." - ) - attention_cls = current_platform.get_attn_backend_cls( - backend, - head_size, - dtype, - kv_cache_dtype, - block_size, - True, # use_v1 - use_mla, - has_sink, - use_sparse, - use_mm_prefix, - attn_type, - ) - else: - attention_cls = current_platform.get_attn_backend_cls( - backend, - head_size, - dtype, - kv_cache_dtype, - block_size, - use_mla, - has_sink, - use_sparse, - use_mm_prefix, - attn_type, - ) + attention_cls = current_platform.get_attn_backend_cls( + backend, + head_size, + dtype, + kv_cache_dtype, + block_size, + use_mla, + has_sink, + use_sparse, + use_mm_prefix, + attn_type, + ) if not attention_cls: raise ValueError( f"Invalid attention backend for {current_platform.device_name}" diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py index b4f1751837f48..99c1c846f19af 100644 --- a/vllm/benchmarks/latency.py +++ b/vllm/benchmarks/latency.py @@ -12,7 +12,6 @@ from typing import Any import numpy as np from tqdm import tqdm -import vllm.envs as envs from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json from vllm.engine.arg_utils import EngineArgs from vllm.inputs import PromptType @@ -79,12 +78,11 @@ def add_cli_args(parser: argparse.ArgumentParser): def main(args: argparse.Namespace): - if args.profile and not envs.VLLM_TORCH_PROFILER_DIR: - raise OSError( - "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. " - "Please set it to a valid path to use torch profiler." - ) engine_args = EngineArgs.from_cli_args(args) + if args.profile and not engine_args.profiler_config.profiler == "torch": + raise ValueError( + "The torch profiler is not enabled. Please provide profiler_config." + ) # Lazy import to avoid importing LLM when the bench command is not selected. 
from vllm import LLM, SamplingParams @@ -144,7 +142,7 @@ def main(args: argparse.Namespace): run_to_completion(profile_dir=None) if args.profile: - profile_dir = envs.VLLM_TORCH_PROFILER_DIR + profile_dir = engine_args.profiler_config.torch_profiler_dir print(f"Profiling (results will be saved to '{profile_dir}')...") run_to_completion(profile_dir=profile_dir) return diff --git a/vllm/benchmarks/serve.py b/vllm/benchmarks/serve.py index 568290aa894ff..254e4d35e5350 100644 --- a/vllm/benchmarks/serve.py +++ b/vllm/benchmarks/serve.py @@ -788,7 +788,7 @@ async def benchmark( ) print( "{:<40} {:<10.2f}".format( - "Total Token throughput (tok/s):", metrics.total_token_throughput + "Total token throughput (tok/s):", metrics.total_token_throughput ) ) @@ -1097,8 +1097,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--profile", action="store_true", - help="Use Torch Profiler. The endpoint must be launched with " - "VLLM_TORCH_PROFILER_DIR to enable profiler.", + help="Use vLLM Profiling. --profiler-config must be provided on the server.", ) parser.add_argument( "--save-result", diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index ea693613fdd16..d824e982b7489 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -655,8 +655,7 @@ def add_cli_args(parser: argparse.ArgumentParser): "--profile", action="store_true", default=False, - help="Use Torch Profiler. The env variable " - "VLLM_TORCH_PROFILER_DIR must be set to enable profiler.", + help="Use vLLM Profiling. --profiler-config must be provided on the server.", ) # prefix repetition dataset diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index 8159b817f637a..dbf154eeb86a4 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from __future__ import annotations + import functools import hashlib import inspect @@ -8,15 +10,17 @@ import json import types from collections.abc import Callable from contextlib import contextmanager -from typing import Any +from typing import TYPE_CHECKING, Any import torch from torch import fx from torch._subclasses.fake_tensor import FakeTensorMode, unset_fake_temporarily -from vllm.config.utils import Range from vllm.utils.torch_utils import is_torch_equal_or_newer +if TYPE_CHECKING: + from vllm.config.utils import Range + if is_torch_equal_or_newer("2.6"): from torch._inductor.custom_graph_pass import CustomGraphPass else: diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py index 6848bfb6a3c53..4ebb386f75ed8 100644 --- a/vllm/compilation/pass_manager.py +++ b/vllm/compilation/pass_manager.py @@ -5,6 +5,7 @@ import functools from torch import fx as fx from vllm import envs +from vllm._aiter_ops import rocm_aiter_ops from vllm.config import VllmConfig, set_current_vllm_config from vllm.logger import init_logger from vllm.platforms import current_platform @@ -13,6 +14,12 @@ from vllm.utils.system_utils import set_env_var from .post_cleanup import PostCleanupPass from .vllm_inductor_pass import VllmInductorPass +if rocm_aiter_ops.is_enabled(): + from vllm.compilation.rocm_aiter_fusion import ( + RocmAiterRMSNormFp8GroupQuantFusionPass, + RocmAiterSiluMulFp8GroupQuantFusionPass, + ) + if current_platform.is_cuda_alike(): from .activation_quant_fusion import ActivationQuantFusionPass from .fusion import 
RMSNormQuantFusionPass @@ -109,8 +116,12 @@ class PostGradPassManager(CustomGraphPass): if self.pass_config.fuse_norm_quant: self.passes += [RMSNormQuantFusionPass(config)] + if rocm_aiter_ops.is_enabled(): + self.passes += [RocmAiterRMSNormFp8GroupQuantFusionPass(config)] if self.pass_config.fuse_act_quant: self.passes += [ActivationQuantFusionPass(config)] + if rocm_aiter_ops.is_enabled(): + self.passes += [RocmAiterSiluMulFp8GroupQuantFusionPass(config)] if self.pass_config.fuse_attn_quant: self.passes += [AttnFusionPass(config)] diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 129b9b5deea31..a15c693767a51 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -53,8 +53,27 @@ class PiecewiseBackend: self.is_last_graph = piecewise_compile_index == total_piecewise_compiles - 1 self.is_full_graph = total_piecewise_compiles == 1 + # TODO: we need to generalize encoder compilation to other models + self.is_encoder_compilation = vllm_backend.prefix in [ + "Qwen2_5_VisionPatchEmbed", + "Qwen2_5_VisionPatchMerger", + "Qwen2_5_VisionBlock", + ] self.compile_ranges = self.compilation_config.get_compile_ranges() + if self.is_encoder_compilation: + # For encoder compilation we use the max int32 value + # to set the upper bound of the compile ranges + max_int32 = 2**31 - 1 + last_compile_range = self.compile_ranges[-1] + assert ( + last_compile_range.end + == vllm_config.scheduler_config.max_num_batched_tokens + ) + self.compile_ranges[-1] = Range( + start=last_compile_range.start, end=max_int32 + ) + log_string = f"PiecewiseBackend: compile_ranges: {self.compile_ranges}" logger.debug_once(log_string) diff --git a/vllm/compilation/rocm_aiter_fusion.py b/vllm/compilation/rocm_aiter_fusion.py new file mode 100644 index 0000000000000..8b5db9de38181 --- /dev/null +++ b/vllm/compilation/rocm_aiter_fusion.py @@ -0,0 +1,242 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any + +import torch +import torch._inductor.pattern_matcher as pm +from torch import fx +from torch._inductor.pattern_matcher import PatternMatcherPass +from torch._ops import OpOverload + +import vllm.model_executor.layers.quantization.utils.fp8_utils # noqa: F401 +from vllm.compilation.activation_quant_fusion import ActivationQuantPattern +from vllm.config import VllmConfig +from vllm.logger import init_logger +from vllm.platforms import current_platform + +from .fusion import empty_bf16 +from .inductor_pass import enable_fake_mode +from .matcher_utils import MatcherSiluAndMul +from .vllm_inductor_pass import VllmInductorPass, VllmPatternMatcherPass + +logger = init_logger(__name__) +FP8_DTYPE = current_platform.fp8_dtype() + +AITER_RMS_GROUP_QUANT_OP = torch.ops.vllm.rocm_aiter_rmsnorm_fp8_group_quant.default +AITER_RMS_ADD_GROUP_QUANT_OP = ( + torch.ops.vllm.rocm_aiter_rmsnorm_with_add_fp8_group_quant.default +) + +AITER_RMS_OP = torch.ops.vllm.rocm_aiter_rms_norm.default +AITER_RMS_ADD_OP = torch.ops.vllm.rocm_aiter_rmsnorm2d_fwd_with_add.default + +AITER_GROUP_FP8_QUANT_OP = torch.ops.vllm.rocm_aiter_group_fp8_quant.default +TRITON_GROUP_FP8_QUANT_OP = torch.ops.vllm.triton_per_token_group_quant_fp8.default + +FUSED_SILU_MUL_QUANT_OP = torch.ops.vllm.rocm_aiter_act_mul_and_fp8_group_quant.default + + +class AiterRMSFp8GroupQuantPattern: + """ + This pattern fuses aiter rms_norm & group fp8 quant custom + ops into an aiter rms_norm_group_fp8_quant op. 
+ """ + + def __init__(self, epsilon: float, quant_dtype: torch.dtype, quant_op: OpOverload): + self.epsilon = epsilon + self.quant_dtype = quant_dtype + self.quant_op = quant_op + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + input: torch.Tensor, + weight: torch.Tensor, + ): + at1 = AITER_RMS_OP(x=input, weight=weight, variance_epsilon=self.epsilon) + + at2 = self.quant_op(at1, 128) + + return at2[0], at2[1] + + def replacement( + input: torch.Tensor, + weight: torch.Tensor, + ): + at = AITER_RMS_GROUP_QUANT_OP( + x=input, + weight=weight, + variance_epsilon=self.epsilon, + group_size=128, + ) + + return at[0], at[1] + + inputs = [ + empty_bf16(5, 4), # input + empty_bf16(1, 5), # weight + ] + + pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass) + + +class AiterFusedAddRMSFp8GroupQuantPattern: + """ + This pattern fuses aiter rms_norm_with_add & group fp8 quant custom ops + into a aiter rms_norm_with_add_group_fp8_quant op. + """ + + def __init__(self, epsilon: float, quant_dtype: torch.dtype, quant_op: OpOverload): + self.epsilon = epsilon + self.quant_dtype = quant_dtype + self.quant_op = quant_op + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + input: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + ): + at1 = AITER_RMS_ADD_OP( + x=input, + residual=residual, + weight=weight, + variance_epsilon=self.epsilon, + ) + + at2 = self.quant_op(at1[0], 128) + + # result, scale, residual + return at2[0], at2[1], at1[1] + + def replacement( + input: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, + ): + at = AITER_RMS_ADD_GROUP_QUANT_OP( + x=input, + residual=residual, + weight=weight, + variance_epsilon=self.epsilon, + group_size=128, + ) + + # result, scale, residual + return at[0], at[1], at[2] + + inputs = [ + empty_bf16(5, 4), # input + empty_bf16(5, 4), # residual + empty_bf16(1, 5), # weight + ] + + pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass) + + +class RocmAiterRMSNormFp8GroupQuantFusionPass(VllmPatternMatcherPass): + """ + This pass fuses rms_norm & quant custom ops into a fused rms_norm_quant op. + It also supports fused_add_rms_norm. + """ + + @enable_fake_mode + def __init__(self, config: VllmConfig): + super().__init__(config) + + self.patterns: PatternMatcherPass = PatternMatcherPass( + pass_name="rocm_aiter_rms_norm_fp8_group_quant_fusion_pass" + ) + + # Make sure fused add patterns are before simple rms norm, + # as the latter is a subset of the former in torch ops + for epsilon in [1e-5, 1e-6]: + # Fuse rms_norm + dynamic group fp8 quant + for quant_op in [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP]: + AiterRMSFp8GroupQuantPattern(epsilon, FP8_DTYPE, quant_op).register( + self.patterns + ) + + AiterFusedAddRMSFp8GroupQuantPattern( + epsilon, FP8_DTYPE, quant_op + ).register(self.patterns) + + self.dump_patterns(config, self.patterns) + + @VllmInductorPass.time_and_log + def __call__(self, graph: fx.Graph): + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) + + def uuid(self) -> Any: + fusion_patterns = [ + AiterRMSFp8GroupQuantPattern, + AiterFusedAddRMSFp8GroupQuantPattern, + ] + return self.hash_source(self, *fusion_patterns) + + +class AiterSiluMulFp8GroupQuantPattern(ActivationQuantPattern): + """ + This pattern fuses aiter silu_and_mul & group fp8 quant custom + ops into an aiter silu_and_mul_group_fp8_quant op. 
+ """ + + def __init__(self, quant_op: OpOverload): + self.silu_and_mul_matcher = MatcherSiluAndMul() + self.quant_op = quant_op + + def register(self, pm_pass: PatternMatcherPass): + def pattern( + input: torch.Tensor, + ): + at1 = self.silu_and_mul_matcher(input) + at2 = self.quant_op(at1, 128) + return at2[0], at2[1] + + def replacement( + input: torch.Tensor, + ): + at = FUSED_SILU_MUL_QUANT_OP(x=input, group_size=128) + return at[0], at[1] + + inputs = [ + self.silu_and_mul_matcher.inputs()[0], + ] + + pm.register_replacement(pattern, replacement, inputs, pm.fwd_only, pm_pass) + + +class RocmAiterSiluMulFp8GroupQuantFusionPass(VllmPatternMatcherPass): + """ + This pass fuses a pre-defined set of custom ops into fused ops. + It uses the torch pattern matcher to find the patterns and replace them. + + Because patterns can only be registered once, the pass is a singleton. + This will be addressed in a future version of PyTorch: + https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980 + """ + + @enable_fake_mode + def __init__(self, config: VllmConfig): + super().__init__(config) + + self.patterns: PatternMatcherPass = PatternMatcherPass( + pass_name="rocm_aiter_silu_mul_fp8_group_quant_fusion_pass" + ) + + for quant_op in [AITER_GROUP_FP8_QUANT_OP, TRITON_GROUP_FP8_QUANT_OP]: + AiterSiluMulFp8GroupQuantPattern(quant_op).register(self.patterns) + + self.dump_patterns(config, self.patterns) + + @VllmInductorPass.time_and_log + def __call__(self, graph: torch.fx.Graph): + self.matched_count = self.patterns.apply(graph) + logger.debug("Replaced %s patterns", self.matched_count) + + def uuid(self): + fusion_patterns = [ + ActivationQuantPattern, + AiterSiluMulFp8GroupQuantPattern, + ] + return VllmInductorPass.hash_source(self, *fusion_patterns) diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 0f84f3ca9d3e3..0e91dd57420a8 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -24,6 +24,7 @@ from vllm.config.multimodal import MultiModalConfig from vllm.config.observability import ObservabilityConfig from vllm.config.parallel import EPLBConfig, ParallelConfig from vllm.config.pooler import PoolerConfig +from vllm.config.profiler import ProfilerConfig from vllm.config.scheduler import SchedulerConfig from vllm.config.speculative import SpeculativeConfig from vllm.config.speech_to_text import SpeechToTextConfig @@ -89,6 +90,8 @@ __all__ = [ "SpeechToTextConfig", # From vllm.config.structured_outputs "StructuredOutputsConfig", + # From vllm.config.profiler + "ProfilerConfig", # From vllm.config.utils "ConfigType", "SupportsMetricsInfo", diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index 51e4912aad9db..3b6cb8a343608 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -17,7 +17,6 @@ from vllm.config.utils import ( Range, config, get_hash_factors, - handle_deprecated, hash_factors, ) from vllm.logger import init_logger @@ -127,27 +126,6 @@ class PassConfig: fuse_allreduce_rms: bool = Field(default=None) """Enable flashinfer allreduce fusion.""" - # Deprecated flags - enable_fusion: bool = Field(default=None) - """Deprecated in: v0.12.0. Use fuse_norm_quant and fuse_act_quant - instead. Will be removed in v0.13.0 or v1.0.0, whichever is sooner. - """ - enable_attn_fusion: bool = Field(default=None) - """Deprecated in: v0.12.0. Use fuse_attn_quant instead. - Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" - enable_noop: bool = Field(default=None) - """Deprecated in: v0.12.0. 
Use eliminate_noops instead. - Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" - enable_sequence_parallelism: bool = Field(default=None) - """Deprecated in: v0.12.0. Use enable_sp instead. - Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" - enable_async_tp: bool = Field(default=None) - """Deprecated in: v0.12.0. Use fuse_gemm_comms instead. - Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" - enable_fi_allreduce_fusion: bool = Field(default=None) - """Deprecated in: v0.12.0. Use fuse_allreduce_rms instead. - Will be removed in v0.13.0 or v1.0.0, whichever is sooner.""" - fi_allreduce_fusion_max_size_mb: float | None = None """The threshold of the communicated tensor sizes under which vllm should use flashinfer fused allreduce. Specified as a @@ -206,15 +184,7 @@ class PassConfig: Any future fields that don't affect compilation should be excluded. """ - ignored_fields = [ - "enable_fusion", - "enable_attn_fusion", - "enable_noop", - "enable_sequence_parallelism", - "enable_async_tp", - "enable_fi_allreduce_fusion", - ] - return hash_factors(get_hash_factors(self, ignored_factors=ignored_fields)) + return hash_factors(get_hash_factors(self, set())) @field_validator( "fuse_norm_quant", @@ -224,12 +194,6 @@ class PassConfig: "enable_sp", "fuse_gemm_comms", "fuse_allreduce_rms", - "enable_fusion", - "enable_attn_fusion", - "enable_noop", - "enable_sequence_parallelism", - "enable_async_tp", - "enable_fi_allreduce_fusion", mode="wrap", ) @classmethod @@ -242,49 +206,6 @@ class PassConfig: def __post_init__(self) -> None: # Handle deprecation and defaults - # Map old flags to new flags and issue warnings - handle_deprecated( - self, - "enable_fusion", - ["fuse_norm_quant", "fuse_act_quant"], - "v0.13.0 or v1.0.0, whichever is sooner", - ) - - handle_deprecated( - self, - "enable_attn_fusion", - "fuse_attn_quant", - "v0.13.0 or v1.0.0, whichever is sooner", - ) - - handle_deprecated( - self, - "enable_sequence_parallelism", - "enable_sp", - "v0.13.0 or v1.0.0, whichever is sooner", - ) - - handle_deprecated( - self, - "enable_async_tp", - "fuse_gemm_comms", - "v0.13.0 or v1.0.0, whichever is sooner", - ) - - handle_deprecated( - self, - "enable_fi_allreduce_fusion", - "fuse_allreduce_rms", - "v0.13.0 or v1.0.0, whichever is sooner", - ) - - handle_deprecated( - self, - "enable_noop", - "eliminate_noops", - "v0.13.0 or v1.0.0, whichever is sooner", - ) - if not self.eliminate_noops: if self.fuse_norm_quant or self.fuse_act_quant: logger.warning_once( diff --git a/vllm/config/kv_transfer.py b/vllm/config/kv_transfer.py index 88f8b91c292bb..98cea821c678e 100644 --- a/vllm/config/kv_transfer.py +++ b/vllm/config/kv_transfer.py @@ -64,6 +64,11 @@ class KVTransferConfig: enable_permute_local_kv: bool = False """Experiment feature flag to enable HND to NHD KV Transfer""" + kv_load_failure_policy: Literal["recompute", "fail"] = "recompute" + """Policy for handling KV cache load failures. 
+ 'recompute': reschedule the request to recompute failed blocks (default) + 'fail': immediately fail the request with an error finish reason""" + def compute_hash(self) -> str: """ WARNING: Whenever a new field is added to this config, diff --git a/vllm/config/model.py b/vllm/config/model.py index 764bdf7000561..03140c17fb50e 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -73,17 +73,6 @@ logger = init_logger(__name__) RunnerOption = Literal["auto", RunnerType] ConvertType = Literal["none", "embed", "classify", "reward"] ConvertOption = Literal["auto", ConvertType] -TaskOption = Literal[ - "auto", - "generate", - "embedding", - "embed", - "classify", - "score", - "reward", - "transcription", - "draft", -] TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] LogprobsMode = Literal[ @@ -93,12 +82,6 @@ HfOverrides = dict[str, Any] | Callable[[PretrainedConfig], PretrainedConfig] ModelImpl = Literal["auto", "vllm", "transformers", "terratorch"] LayerBlockType = Literal["attention", "linear_attention", "mamba"] -_RUNNER_TASKS: dict[RunnerType, list[TaskOption]] = { - "generate": ["generate", "transcription"], - "pooling": ["embedding", "embed", "classify", "score", "reward"], - "draft": ["draft"], -} - _RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = { "generate": [], "pooling": ["embed", "classify", "reward"], @@ -126,12 +109,6 @@ class ModelConfig: """Convert the model using adapters defined in [vllm.model_executor.models.adapters][]. The most common use case is to adapt a text generation model to be used for pooling tasks.""" - task: TaskOption | None = None - """[DEPRECATED] The task to use the model for. If the model supports more - than one model runner, this is used to select which model runner to run. - - Note that the model may support other tasks using the same model runner. - """ tokenizer: SkipValidation[str] = None # type: ignore """Name or path of the Hugging Face tokenizer to use. If unspecified, model name or path will be used.""" @@ -335,7 +312,6 @@ class ModelConfig: ignored_factors = { "runner", "convert", - "task", "tokenizer", "tokenizer_mode", "seed", @@ -510,97 +486,6 @@ class ModelConfig: is_generative_model = registry.is_text_generation_model(architectures, self) is_pooling_model = registry.is_pooling_model(architectures, self) - def _task_to_convert(task: TaskOption) -> ConvertType: - if task == "embedding" or task == "embed": - return "embed" - if task == "classify": - return "classify" - if task == "reward": - logger.warning( - "Pooling models now default support all pooling; " - "you can use it without any settings." - ) - return "embed" - if task == "score": - new_task = self._get_default_pooling_task(architectures) - return "classify" if new_task == "classify" else "embed" - - return "none" - - if self.task is not None: - runner: RunnerOption = "auto" - convert: ConvertOption = "auto" - msg_prefix = ( - "The 'task' option has been deprecated and will be " - "removed in v0.13.0 or v1.0, whichever comes first." - ) - msg_hint = "Please remove this option." - - is_generative_task = self.task in _RUNNER_TASKS["generate"] - is_pooling_task = self.task in _RUNNER_TASKS["pooling"] - - if is_generative_model and is_pooling_model: - if is_generative_task: - runner = "generate" - convert = "auto" - msg_hint = ( - "Please replace this option with `--runner " - "generate` to continue using this model " - "as a generative model." 
- ) - elif is_pooling_task: - runner = "pooling" - convert = "auto" - msg_hint = ( - "Please replace this option with `--runner " - "pooling` to continue using this model " - "as a pooling model." - ) - else: # task == "auto" - pass - elif is_generative_model or is_pooling_model: - if is_generative_task: - runner = "generate" - convert = "auto" - msg_hint = "Please remove this option" - elif is_pooling_task: - runner = "pooling" - convert = _task_to_convert(self.task) - msg_hint = ( - "Please replace this option with `--convert " - f"{convert}` to continue using this model " - "as a pooling model." - ) - else: # task == "auto" - pass - else: - # Neither generative nor pooling model - try to convert if possible - if is_pooling_task: - runner = "pooling" - convert = _task_to_convert(self.task) - msg_hint = ( - "Please replace this option with `--runner pooling " - f"--convert {convert}` to continue using this model " - "as a pooling model." - ) - else: - debug_info = { - "architectures": architectures, - "is_generative_model": is_generative_model, - "is_pooling_model": is_pooling_model, - } - raise AssertionError( - "The model should be a generative or " - "pooling model when task is set to " - f"{self.task!r}. Found: {debug_info}" - ) - - self.runner = runner - self.convert = convert - - msg = f"{msg_prefix} {msg_hint}" - warnings.warn(msg, DeprecationWarning, stacklevel=2) - self.runner_type = self._get_runner_type(architectures, self.runner) self.convert_type = self._get_convert_type( architectures, self.runner_type, self.convert @@ -903,6 +788,13 @@ class ModelConfig: runner_type: RunnerType, convert: ConvertOption, ) -> ConvertType: + if convert == "reward": + logger.warning( + "`--convert reward` is deprecated and will be removed in v0.15. " + "Please use `--convert embed` instead." + ) + return "embed" + if convert != "auto": return convert @@ -918,22 +810,6 @@ class ModelConfig: return convert_type - def _get_default_pooling_task( - self, - architectures: list[str], - ) -> Literal["embed", "classify", "reward"]: - if self.registry.is_cross_encoder_model(architectures, self): - return "classify" - - for arch in architectures: - match = try_match_architecture_defaults(arch, runner_type="pooling") - if match: - _, (_, convert_type) = match - assert convert_type != "none" - return convert_type - - return "embed" - def _parse_quant_hf_config(self, hf_config: PretrainedConfig): quant_cfg = getattr(hf_config, "quantization_config", None) if quant_cfg is None: diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 0327832c4fb8c..1f9dd38ac9114 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -317,11 +317,6 @@ class ParallelConfig: "num_redundant_experts." ) - if self.prefill_context_parallel_size > 1: - raise ValueError( - "Prefill context parallelism is not fully supported. " - "Please set prefill_context_parallel_size to 1." - ) return self @property diff --git a/vllm/config/pooler.py b/vllm/config/pooler.py index aa4e7006d0247..976ae8c063eb7 100644 --- a/vllm/config/pooler.py +++ b/vllm/config/pooler.py @@ -111,13 +111,15 @@ class PoolerConfig: def get_use_activation(o: object): if softmax := getattr(o, "softmax", None) is not None: logger.warning_once( - "softmax will be deprecated, please use use_activation instead." + "softmax will be deprecated and will be removed in v0.15. " + "Please use use_activation instead." 
) return softmax if activation := getattr(o, "activation", None) is not None: logger.warning_once( - "activation will be deprecated, please use use_activation instead." + "activation will be deprecated and will be removed in v0.15. " + "Please use use_activation instead." ) return activation diff --git a/vllm/config/profiler.py b/vllm/config/profiler.py new file mode 100644 index 0000000000000..76cc546f3c9e2 --- /dev/null +++ b/vllm/config/profiler.py @@ -0,0 +1,199 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import os +from typing import Any, Literal + +from pydantic import Field, model_validator +from pydantic.dataclasses import dataclass +from typing_extensions import Self + +import vllm.envs as envs +from vllm.config.utils import config +from vllm.logger import init_logger +from vllm.utils.hashing import safe_hash + +logger = init_logger(__name__) + +ProfilerKind = Literal["torch", "cuda"] + + +@config +@dataclass +class ProfilerConfig: + """Dataclass which contains profiler config for the engine.""" + + profiler: ProfilerKind | None = None + """Which profiler to use. Defaults to None. Options are: + + - 'torch': Use PyTorch profiler.\n + - 'cuda': Use CUDA profiler.""" + + torch_profiler_dir: str = "" + """Directory to save torch profiler traces. Both AsyncLLM's CPU traces and + worker's traces (CPU & GPU) will be saved under this directory. Note that + it must be an absolute path.""" + + torch_profiler_with_stack: bool = True + """If `True`, enables stack tracing in the torch profiler. Enabled by default.""" + + torch_profiler_with_flops: bool = False + """If `True`, enables FLOPS counting in the torch profiler. Disabled by default.""" + + torch_profiler_use_gzip: bool = True + """If `True`, saves torch profiler traces in gzip format. Enabled by default""" + + torch_profiler_dump_cuda_time_total: bool = True + """If `True`, dumps total CUDA time in torch profiler traces. Enabled by default.""" + + torch_profiler_record_shapes: bool = False + """If `True`, records tensor shapes in the torch profiler. Disabled by default.""" + + torch_profiler_with_memory: bool = False + """If `True`, enables memory profiling in the torch profiler. + Disabled by default.""" + + ignore_frontend: bool = False + """If `True`, disables the front-end profiling of AsyncLLM when using the + 'torch' profiler. This is needed to reduce overhead when using delay/limit options, + since the front-end profiling does not track iterations and will capture the + entire range. + """ + + delay_iterations: int = Field(default=0, ge=0) + """Number of engine iterations to skip before starting profiling. + Defaults to 0, meaning profiling starts immediately after receiving /start_profile. + """ + + max_iterations: int = Field(default=0, ge=0) + """Maximum number of engine iterations to profile after starting profiling. + Defaults to 0, meaning no limit. + """ + + def compute_hash(self) -> str: + """ + WARNING: Whenever a new field is added to this config, + ensure that it is included in the factors list if + it affects the computation graph. + + Provide a hash that uniquely identifies all the configs + that affect the structure of the computation + graph from input ids/embeddings to the final hidden states, + excluding anything before input ids/embeddings and after + the final hidden states. + """ + # no factors to consider. + # this config will not affect the computation graph. 
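ProfilerConfig consolidates what used to be a family of VLLM_TORCH_PROFILER_* environment variables into one engine config. A minimal construction sketch, assuming a vLLM build that already contains this file (the validator requires torch_profiler_dir whenever profiler="torch" and expands it to an absolute path):

from vllm.config import ProfilerConfig

profiler_config = ProfilerConfig(
    profiler="torch",
    torch_profiler_dir="/tmp/vllm_traces",  # required for the torch profiler
    delay_iterations=5,    # skip the first 5 engine iterations after /start_profile
    max_iterations=20,     # then capture at most 20 iterations
    ignore_frontend=True,  # keeps overhead down when delay/limit options are used
)

The same settings reach a running server through the --profiler-config engine argument registered later in this diff.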
+ factors: list[Any] = [] + hash_str = safe_hash(str(factors).encode(), usedforsecurity=False).hexdigest() + return hash_str + + def _get_from_env_if_set(self, field_name: str, env_var_name: str) -> None: + """Get field from env var if set, with deprecation warning.""" + + if envs.is_set(env_var_name): + value = getattr(envs, env_var_name) + logger.warning_once( + "Using %s environment variable is deprecated and will be removed in " + "v0.14.0 or v1.0.0, whichever is soonest. Please use " + "--profiler-config.%s command line argument or " + "ProfilerConfig(%s=...) config field instead.", + env_var_name, + field_name, + field_name, + ) + return value + return None + + def _set_from_env_if_set( + self, + field_name: str, + env_var_name: str, + to_bool: bool = True, + to_int: bool = False, + ) -> None: + """Set field from env var if set, with deprecation warning.""" + value = self._get_from_env_if_set(field_name, env_var_name) + if value is not None: + if to_bool: + value = value == "1" + if to_int: + value = int(value) + setattr(self, field_name, value) + + @model_validator(mode="after") + def _validate_profiler_config(self) -> Self: + maybe_use_cuda_profiler = self._get_from_env_if_set( + "profiler", "VLLM_TORCH_CUDA_PROFILE" + ) + if maybe_use_cuda_profiler is not None: + self.profiler = "cuda" if maybe_use_cuda_profiler == "1" else None + else: + self._set_from_env_if_set( + "torch_profiler_dir", "VLLM_TORCH_PROFILER_DIR", to_bool=False + ) + if self.torch_profiler_dir: + self.profiler = "torch" + self._set_from_env_if_set( + "torch_profiler_record_shapes", + "VLLM_TORCH_PROFILER_RECORD_SHAPES", + ) + self._set_from_env_if_set( + "torch_profiler_with_memory", + "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", + ) + self._set_from_env_if_set( + "torch_profiler_with_stack", + "VLLM_TORCH_PROFILER_WITH_STACK", + ) + self._set_from_env_if_set( + "torch_profiler_with_flops", + "VLLM_TORCH_PROFILER_WITH_FLOPS", + ) + self._set_from_env_if_set( + "ignore_frontend", + "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM", + ) + self._set_from_env_if_set( + "torch_profiler_use_gzip", + "VLLM_TORCH_PROFILER_USE_GZIP", + ) + self._set_from_env_if_set( + "torch_profiler_dump_cuda_time_total", + "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL", + ) + + self._set_from_env_if_set( + "delay_iterations", "VLLM_PROFILER_DELAY_ITERS", to_bool=False, to_int=True + ) + self._set_from_env_if_set( + "max_iterations", "VLLM_PROFILER_MAX_ITERS", to_bool=False, to_int=True + ) + + has_delay_or_limit = self.delay_iterations > 0 or self.max_iterations > 0 + if self.profiler == "torch" and has_delay_or_limit and not self.ignore_frontend: + logger.warning_once( + "Using 'torch' profiler with delay_iterations or max_iterations " + "while ignore_frontend is False may result in high overhead." 
+ ) + + profiler_dir = self.torch_profiler_dir + if profiler_dir and self.profiler != "torch": + raise ValueError( + "torch_profiler_dir is only applicable when profiler is set to 'torch'" + ) + if self.profiler == "torch" and not profiler_dir: + raise ValueError("torch_profiler_dir must be set when profiler is 'torch'") + + if profiler_dir: + is_gs_path = ( + profiler_dir.startswith("gs://") + and profiler_dir[5:] + and profiler_dir[5] != "/" + ) + if not is_gs_path: + self.torch_profiler_dir = os.path.abspath( + os.path.expanduser(profiler_dir) + ) + + return self diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index a74413536407b..0e75daf0d722c 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -39,6 +39,7 @@ from .lora import LoRAConfig from .model import ModelConfig from .observability import ObservabilityConfig from .parallel import ParallelConfig +from .profiler import ProfilerConfig from .scheduler import SchedulerConfig from .speculative import SpeculativeConfig from .structured_outputs import StructuredOutputsConfig @@ -218,6 +219,8 @@ class VllmConfig: You can specify the full compilation config like so: `{"mode": 3, "cudagraph_capture_sizes": [1, 2, 4, 8]}` """ + profiler_config: ProfilerConfig = Field(default_factory=ProfilerConfig) + """Profiling configuration.""" kv_transfer_config: KVTransferConfig | None = None """The configurations for distributed KV cache transfer.""" kv_events_config: KVEventsConfig | None = None @@ -296,6 +299,8 @@ class VllmConfig: vllm_factors.append("None") if self.structured_outputs_config: vllm_factors.append(self.structured_outputs_config.compute_hash()) + if self.profiler_config: + vllm_factors.append(self.profiler_config.compute_hash()) else: vllm_factors.append("None") vllm_factors.append(self.observability_config.compute_hash()) @@ -661,8 +666,9 @@ class VllmConfig: default_config = OPTIMIZATION_LEVEL_TO_CONFIG[self.optimization_level] self._apply_optimization_level_defaults(default_config) + if ( - self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE + self.compilation_config.cudagraph_mode.requires_piecewise_compilation() and self.compilation_config.mode != CompilationMode.VLLM_COMPILE ): logger.info( @@ -687,22 +693,29 @@ class VllmConfig: if current_platform.support_static_graph_mode(): # if cudagraph_mode has full cudagraphs, we need to check support - if ( - self.compilation_config.cudagraph_mode.has_full_cudagraphs() - and self.model_config is not None - ): - if self.model_config.pooler_config is not None: + if model_config := self.model_config: + if ( + self.compilation_config.cudagraph_mode.has_full_cudagraphs() + and model_config.pooler_config is not None + ): logger.warning_once( "Pooling models do not support full cudagraphs. " "Overriding cudagraph_mode to PIECEWISE." ) self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE - elif self.model_config.is_encoder_decoder: - logger.warning_once( - "Encoder-decoder models do not support full cudagraphs. " - "Overriding cudagraph_mode to PIECEWISE." + elif ( + model_config.is_encoder_decoder + and self.compilation_config.cudagraph_mode + not in (CUDAGraphMode.NONE, CUDAGraphMode.FULL_DECODE_ONLY) + ): + logger.info_once( + "Encoder-decoder models do not support %s. 
" + "Overriding cudagraph_mode to FULL_DECODE_ONLY.", + self.compilation_config.cudagraph_mode.name, + ) + self.compilation_config.cudagraph_mode = ( + CUDAGraphMode.FULL_DECODE_ONLY ) - self.compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE # disable cudagraph when enforce eager execution if self.model_config is not None and self.model_config.enforce_eager: @@ -807,11 +820,6 @@ class VllmConfig: f"({self.parallel_config.cp_kv_cache_interleave_size})." ) - assert ( - self.parallel_config.cp_kv_cache_interleave_size == 1 - or self.speculative_config is None - ), "MTP with cp_kv_cache_interleave_size > 1 is not supported now." - # Do this after all the updates to compilation_config.mode self.compilation_config.set_splitting_ops_for_v1( all2all_backend=self.parallel_config.all2all_backend, @@ -1001,7 +1009,7 @@ class VllmConfig: max_graph_size = min(max_num_seqs * 2, 512) # 1, 2, 4, then multiples of 8 up to 256 and then multiples of 16 # up to max_graph_size - cuda_graph_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list( + cudagraph_capture_sizes = [1, 2, 4] + list(range(8, 256, 8)) + list( range(256, max_graph_size + 1, 16)) In the end, `vllm_config.compilation_config.cudagraph_capture_sizes` @@ -1042,8 +1050,14 @@ class VllmConfig: self.compilation_config.max_cudagraph_capture_size ) if max_cudagraph_capture_size is None: + decode_query_len = 1 + if ( + self.speculative_config + and self.speculative_config.num_speculative_tokens + ): + decode_query_len += self.speculative_config.num_speculative_tokens max_cudagraph_capture_size = min( - self.scheduler_config.max_num_seqs * 2, 512 + self.scheduler_config.max_num_seqs * decode_query_len * 2, 512 ) max_num_tokens = self.scheduler_config.max_num_batched_tokens max_cudagraph_capture_size = min(max_num_tokens, max_cudagraph_capture_size) diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py index 2e878eef908ac..cd9c267beb5b5 100644 --- a/vllm/distributed/device_communicators/cuda_communicator.py +++ b/vllm/distributed/device_communicators/cuda_communicator.py @@ -225,7 +225,7 @@ class CudaCommunicator(DeviceCommunicatorBase): output_shape, dtype=input_tensor.dtype, device=input_tensor.device ) - if sizes is not None: + if sizes is not None and sizes.count(sizes[0]) != len(sizes): pynccl_comm.reduce_scatterv(output, input_tensor, sizes=sizes) else: pynccl_comm.reduce_scatter(output, input_tensor) diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py index 114516ff07a1f..31c6084c9b507 100644 --- a/vllm/distributed/device_communicators/shm_broadcast.py +++ b/vllm/distributed/device_communicators/shm_broadcast.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools import pickle +import threading import time from contextlib import contextmanager from dataclasses import dataclass, field @@ -43,6 +44,33 @@ VLLM_RINGBUFFER_WARNING_INTERVAL = envs.VLLM_RINGBUFFER_WARNING_INTERVAL from_bytes_big = functools.partial(int.from_bytes, byteorder="big") +# Memory fence for cross-process shared memory visibility. +# Required for correct producer-consumer synchronization when using +# shared memory without locks. +_memory_fence_lock = threading.Lock() + + +def memory_fence(): + """ + Full memory barrier for shared memory synchronization. + + Ensures all prior memory writes are visible to other processes before + any subsequent reads. 
This is critical for lock-free producer-consumer + patterns using shared memory. + + Implementation acquires and immediately releases a lock. Python's + threading.Lock provides sequentially consistent memory barrier semantics + across all major platforms (POSIX, Windows). This is a lightweight + operation (~20ns) that guarantees: + - All stores before the barrier are visible to other threads/processes + - All loads after the barrier see the latest values + """ + # Lock acquire/release provides full memory barrier semantics. + # Using context manager ensures lock release even on exceptions. + with _memory_fence_lock: + pass + + def to_bytes_big(value: int, size: int) -> bytes: return value.to_bytes(size, byteorder="big") @@ -414,6 +442,10 @@ class MessageQueue: n_warning = 1 while True: with self.buffer.get_metadata(self.current_idx) as metadata_buffer: + # Memory fence ensures we see the latest read flags from readers. + # Without this, we may read stale flags from our CPU cache and + # spin indefinitely even though readers have completed. + memory_fence() read_count = sum(metadata_buffer[1:]) written_flag = metadata_buffer[0] if written_flag and read_count != self.buffer.n_reader: @@ -458,6 +490,10 @@ class MessageQueue: metadata_buffer[i] = 0 # mark the block as written metadata_buffer[0] = 1 + # Memory fence ensures the write is visible to readers on other cores + # before we proceed. Without this, readers may spin indefinitely + # waiting for a write that's stuck in our CPU's store buffer. + memory_fence() self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks break @@ -473,6 +509,10 @@ class MessageQueue: n_warning = 1 while True: with self.buffer.get_metadata(self.current_idx) as metadata_buffer: + # Memory fence ensures we see the latest writes from the writer. + # Without this, we may read stale flags from our CPU cache + # and spin indefinitely even though writer has updated them. + memory_fence() read_flag = metadata_buffer[self.local_reader_rank + 1] written_flag = metadata_buffer[0] if not written_flag or read_flag: @@ -513,6 +553,10 @@ class MessageQueue: # caller has read from the buffer # set the read flag metadata_buffer[self.local_reader_rank + 1] = 1 + # Memory fence ensures the read flag is visible to the writer. + # Without this, writer may not see our read completion and + # could wait indefinitely for all readers to finish. + memory_fence() self.current_idx = (self.current_idx + 1) % self.buffer.max_chunks self._read_spin_timer.record_activity() diff --git a/vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py b/vllm/distributed/ec_transfer/ec_connector/example_connector.py similarity index 96% rename from vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py rename to vllm/distributed/ec_transfer/ec_connector/example_connector.py index c8388141dcc97..5f2eff5a8e6a8 100644 --- a/vllm/distributed/ec_transfer/ec_connector/shared_storage_connector.py +++ b/vllm/distributed/ec_transfer/ec_connector/example_connector.py @@ -32,7 +32,7 @@ class MMMeta: @dataclass -class ECSharedStorageConnectorMetadata(ECConnectorMetadata): +class ECExampleConnectorMetadata(ECConnectorMetadata): mm_datas: list[MMMeta] def __init__(self): @@ -42,7 +42,7 @@ class ECSharedStorageConnectorMetadata(ECConnectorMetadata): self.mm_datas.append(mm_data) -class ECSharedStorageConnector(ECConnectorBase): +class ECExampleConnector(ECConnectorBase): # NOTE: This is Simple debug implementation of the EC connector. 
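The memory_fence() helper above is deliberately simple: acquiring and releasing a threading.Lock gives a full barrier, and the ring buffer calls it around every flag read and update. A standalone sketch of the same flag protocol (one writer, N readers sharing a metadata byte array), with hypothetical names and multiprocessing shared memory standing in for vLLM's buffer:

import threading
from multiprocessing import shared_memory

_fence_lock = threading.Lock()

def memory_fence() -> None:
    # Lock acquire/release acts as a full barrier, as the docstring above argues.
    with _fence_lock:
        pass

def mark_written(metadata: memoryview, n_readers: int) -> None:
    """Writer side: clear per-reader flags, then publish the written flag."""
    for i in range(1, n_readers + 1):
        metadata[i] = 0
    metadata[0] = 1
    memory_fence()  # make the flags visible before the writer moves on

def try_consume(metadata: memoryview, reader_rank: int) -> bool:
    """Reader side: return True once this reader has acknowledged the chunk."""
    memory_fence()  # see the writer's latest flags, not a stale cached value
    if not metadata[0] or metadata[reader_rank + 1]:
        return False  # nothing new for this reader yet
    # ... read the payload here ...
    metadata[reader_rank + 1] = 1
    memory_fence()  # make the acknowledgement visible to the writer
    return True

# Tiny single-process demo of the flag handshake (1 writer, 2 readers):
shm = shared_memory.SharedMemory(create=True, size=3)
meta = shm.buf
mark_written(meta, n_readers=2)
assert try_consume(meta, reader_rank=0) and try_consume(meta, reader_rank=1)
del meta
shm.close()
shm.unlink()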
# It save / load the EC cache to / from the disk. @@ -76,7 +76,7 @@ class ECSharedStorageConnector(ECConnectorBase): # Get the metadata metadata: ECConnectorMetadata = self._get_connector_metadata() - assert isinstance(metadata, ECSharedStorageConnectorMetadata) + assert isinstance(metadata, ECExampleConnectorMetadata) assert encoder_cache is not None if metadata is None: logger.warning( @@ -160,7 +160,7 @@ class ECSharedStorageConnector(ECConnectorBase): Args: scheduler_output (SchedulerOutput): the scheduler output object. """ - meta = ECSharedStorageConnectorMetadata() + meta = ECExampleConnectorMetadata() for mm_hash, num_encoder_token in self._mm_datas_need_loads.items(): meta.add_mm_data(MMMeta.make_meta(mm_hash, num_encoder_token)) self._mm_datas_need_loads.clear() diff --git a/vllm/distributed/ec_transfer/ec_connector/factory.py b/vllm/distributed/ec_transfer/ec_connector/factory.py index e51b32e6f6dff..32f36ffbb14d2 100644 --- a/vllm/distributed/ec_transfer/ec_connector/factory.py +++ b/vllm/distributed/ec_transfer/ec_connector/factory.py @@ -79,7 +79,7 @@ class ECConnectorFactory: # only load the files corresponding to the current connector. ECConnectorFactory.register_connector( - "ECSharedStorageConnector", - "vllm.distributed.ec_transfer.ec_connector.shared_storage_connector", - "ECSharedStorageConnector", + "ECExampleConnector", + "vllm.distributed.ec_transfer.ec_connector.example_connector", + "ECExampleConnector", ) diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py index 55856d940f001..376dad8a72ef1 100644 --- a/vllm/distributed/eplb/rebalance_execute.py +++ b/vllm/distributed/eplb/rebalance_execute.py @@ -322,6 +322,9 @@ async def transfer_layer( num_local_physical_experts = next(iter(expert_weights[0])).shape[0] assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts) assert num_physical_experts == ep_size * num_local_physical_experts + # A buffer to hold the expert weights in one layer during the exchange. + # NOTE: Currently we assume the same weights across different layers + # have the same shape. is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer( num_local_experts=num_local_physical_experts, diff --git a/vllm/distributed/kv_events.py b/vllm/distributed/kv_events.py index 7b5cb94cf13ea..3b76af75504de 100644 --- a/vllm/distributed/kv_events.py +++ b/vllm/distributed/kv_events.py @@ -5,7 +5,7 @@ import queue import threading import time from abc import ABC, abstractmethod -from collections import deque +from collections import Counter, deque from collections.abc import Callable from dataclasses import asdict from itertools import count @@ -54,11 +54,26 @@ class BlockStored(KVCacheEvent): lora_id: int | None medium: str | None + def __hash__(self) -> int: + return hash( + ( + tuple(self.block_hashes), + self.parent_block_hash, + tuple(self.token_ids), + self.block_size, + self.lora_id, + self.medium, + ) + ) + class BlockRemoved(KVCacheEvent): block_hashes: list[ExternalBlockHash] medium: str | None + def __hash__(self) -> int: + return hash((tuple(self.block_hashes), self.medium)) + class AllBlocksCleared(KVCacheEvent): pass @@ -68,6 +83,119 @@ class KVEventBatch(EventBatch): events: list[BlockStored | BlockRemoved | AllBlocksCleared] +class KVEventAggregator: + """ + Aggregates KV events across multiple workers. + Tracks how many times each event appears and returns only those + that were emitted by all workers. 
+ """ + + __slots__ = ("_event_counter", "_num_workers") + + def __init__(self, num_workers: int) -> None: + if num_workers <= 0: + raise ValueError("num_workers must be greater than zero.") + self._event_counter: Counter[KVCacheEvent] = Counter() + self._num_workers: int = num_workers + + def add_events(self, events: list[KVCacheEvent]) -> None: + """ + Add events from a worker batch. + + :param events: List of KVCacheEvent objects. + """ + if not isinstance(events, list): + raise TypeError("events must be a list of KVCacheEvent.") + self._event_counter.update(events) + + def get_common_events(self) -> list[KVCacheEvent]: + """ + Return events that appeared in all workers. + + :return: List of events present in all workers. + """ + return [ + event + for event, count in self._event_counter.items() + if count == self._num_workers + ] + + def get_all_events(self) -> list[KVCacheEvent]: + """ + Return all events for all workers. + + :return: List of events for all workers. + """ + return list(self._event_counter.elements()) + + def clear_events(self) -> None: + """ + Clear all tracked events. + """ + self._event_counter.clear() + + def increment_workers(self, count: int = 1) -> None: + """ + Increment the number of workers contributing events. + + :param count: Number to increment the workers by. + """ + if count <= 0: + raise ValueError("count must be positive.") + self._num_workers += count + + def reset_workers(self) -> None: + """ + Reset the number of workers to 1. + """ + self._num_workers = 1 + + def get_number_of_workers(self) -> int: + """ + Return the number of workers. + + :return: int number of workers. + """ + return self._num_workers + + def __repr__(self) -> str: + return ( + f"" + ) + + +class KVConnectorKVEvents(ABC): + """ + Abstract base class for KV events. + Acts as a container for KV events from the connector. + """ + + @abstractmethod + def add_events(self, events: list[KVCacheEvent]) -> None: + raise NotImplementedError + + @abstractmethod + def aggregate(self) -> "KVConnectorKVEvents": + raise NotImplementedError + + @abstractmethod + def increment_workers(self, count: int = 1) -> None: + raise NotImplementedError + + @abstractmethod + def get_all_events(self) -> list[KVCacheEvent]: + raise NotImplementedError + + @abstractmethod + def get_number_of_workers(self) -> int: + raise NotImplementedError + + @abstractmethod + def clear_events(self) -> None: + raise NotImplementedError + + class EventPublisher(ABC): """Lightweight publisher for EventBatch batches with data parallelism support. diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py index 02f51a1dce112..02d9a1ec9599e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/factory.py +++ b/vllm/distributed/kv_transfer/kv_connector/factory.py @@ -144,9 +144,9 @@ class KVConnectorFactory: # only load the files corresponding to the current connector. 
KVConnectorFactory.register_connector( - "SharedStorageConnector", - "vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector", - "SharedStorageConnector", + "ExampleConnector", + "vllm.distributed.kv_transfer.kv_connector.v1.example_connector", + "ExampleConnector", ) KVConnectorFactory.register_connector( diff --git a/vllm/distributed/kv_transfer/kv_connector/utils.py b/vllm/distributed/kv_transfer/kv_connector/utils.py index 99d3be57c1381..117d159e25e71 100644 --- a/vllm/distributed/kv_transfer/kv_connector/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/utils.py @@ -78,6 +78,7 @@ class KVOutputAggregator: finished_sending = set[str]() finished_recving = set[str]() aggregated_kv_connector_stats = None + combined_kv_cache_events = None invalid_block_ids = set[int]() for model_runner_output in outputs: assert model_runner_output is not None @@ -119,6 +120,19 @@ class KVOutputAggregator: aggregated_kv_connector_stats.aggregate(kv_connector_stats) ) + # Combine kv_cache_events from all workers. + if combined_kv_cache_events is None: + # Use the first worker's kv_cache events as start event list. + combined_kv_cache_events = kv_output.kv_cache_events + elif kv_cache_events := kv_output.kv_cache_events: + assert isinstance( + combined_kv_cache_events, + type(kv_cache_events), + ) + worker_kv_cache_events = kv_cache_events.get_all_events() + combined_kv_cache_events.add_events(worker_kv_cache_events) + combined_kv_cache_events.increment_workers(1) + invalid_block_ids |= kv_output.invalid_block_ids # select output of the worker specified by output_rank @@ -129,6 +143,7 @@ class KVOutputAggregator: finished_sending=finished_sending or None, finished_recving=finished_recving or None, kv_connector_stats=aggregated_kv_connector_stats or None, + kv_cache_events=combined_kv_cache_events or None, invalid_block_ids=invalid_block_ids, expected_finished_count=self._expected_finished_count, ) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py index 91f6443f92cbe..c05e5485a835e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py @@ -49,7 +49,7 @@ from vllm.v1.outputs import KVConnectorOutput if TYPE_CHECKING: from vllm.config import VllmConfig - from vllm.distributed.kv_events import KVCacheEvent + from vllm.distributed.kv_events import KVCacheEvent, KVConnectorKVEvents from vllm.distributed.kv_transfer.kv_connector.v1.metrics import ( KVConnectorPromMetrics, KVConnectorStats, @@ -379,6 +379,14 @@ class KVConnectorBase_V1(ABC): """ return None + def get_kv_connector_kv_cache_events(self) -> Optional["KVConnectorKVEvents"]: + """ + Get the KV connector kv cache events collected during the last interval. + This function should be called by the model runner every time after the + model execution and before cleanup. + """ + return None + def get_handshake_metadata(self) -> KVConnectorHandshakeMetadata | None: """ Get the KVConnector handshake metadata for this connector. 
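Two scheduler-facing changes meet here: the debug connector is now registered only under the name ExampleConnector, and KVTransferConfig (earlier in this diff) gained kv_load_failure_policy. A hedged configuration sketch; the kv_role field follows the pre-existing KVTransferConfig API and is an assumption in this context, while the connector name and the new policy field come from this diff:

from vllm.config import KVTransferConfig

kv_transfer_config = KVTransferConfig(
    kv_connector="ExampleConnector",     # renamed from SharedStorageConnector in this diff
    kv_role="kv_both",
    kv_load_failure_policy="recompute",  # new field: "fail" aborts the request instead
)

Deployments that previously passed kv_connector="SharedStorageConnector" would need to switch to the new name, since the factory above registers only "ExampleConnector" and keeps no alias.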
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py similarity index 98% rename from vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py rename to vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py index ed641cfc43ddd..41243fc866b59 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/shared_storage_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/example_connector.py @@ -65,7 +65,7 @@ class ReqMeta: @dataclass -class SharedStorageConnectorMetadata(KVConnectorMetadata): +class ExampleConnectorMetadata(KVConnectorMetadata): requests: list[ReqMeta] = field(default_factory=list) def add_request( @@ -81,7 +81,7 @@ class SharedStorageConnectorMetadata(KVConnectorMetadata): ) -class SharedStorageConnector(KVConnectorBase_V1): +class ExampleConnector(KVConnectorBase_V1): # NOTE: This is Simple debug implementation of the KV connector. # It save / load the KV cache to / from the disk. # It does extra work which will overwrite the existing prefix-cache in GPU @@ -157,7 +157,7 @@ class SharedStorageConnector(KVConnectorBase_V1): # Get the metadata metadata: KVConnectorMetadata = self._get_connector_metadata() - assert isinstance(metadata, SharedStorageConnectorMetadata) + assert isinstance(metadata, ExampleConnectorMetadata) if metadata is None: logger.warning( @@ -241,7 +241,7 @@ class SharedStorageConnector(KVConnectorBase_V1): return layer.reshape(2, num_pages * page_size, -1)[:, slot_mapping, ...] connector_metadata = self._get_connector_metadata() - assert isinstance(connector_metadata, SharedStorageConnectorMetadata) + assert isinstance(connector_metadata, ExampleConnectorMetadata) for request in connector_metadata.requests: if request.is_store: filename = self._generate_filename_debug( @@ -315,7 +315,7 @@ class SharedStorageConnector(KVConnectorBase_V1): Args: scheduler_output (SchedulerOutput): the scheduler output object. 
""" - meta = SharedStorageConnectorMetadata() + meta = ExampleConnectorMetadata() total_need_load = 0 for new_req in scheduler_output.scheduled_new_reqs: diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py index 30da424ddcca0..17d468fe6c305 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_connector.py @@ -1,14 +1,18 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from collections.abc import Iterable from typing import TYPE_CHECKING, Any import torch -from lmcache.integration.vllm.vllm_v1_adapter import ( - LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl, -) from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig +from vllm.distributed.kv_events import ( + BlockStored, + KVCacheEvent, + KVConnectorKVEvents, + KVEventAggregator, +) from vllm.distributed.kv_transfer.kv_connector.v1.base import ( KVConnectorBase_V1, KVConnectorMetadata, @@ -16,6 +20,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.base import ( ) from vllm.logger import init_logger from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.outputs import KVConnectorOutput if TYPE_CHECKING: from vllm.forward_context import ForwardContext @@ -26,6 +31,44 @@ if TYPE_CHECKING: logger = init_logger(__name__) +class LMCacheKVEvents(KVConnectorKVEvents): + """ + Concrete implementation of KVConnectorKVEvents using KVEventAggregator. + """ + + def __init__(self, num_workers: int) -> None: + self._aggregator = KVEventAggregator(num_workers) + + def add_events(self, events: list[KVCacheEvent]) -> None: + self._aggregator.add_events(events) + + def aggregate(self) -> "LMCacheKVEvents": + """ + Aggregate KV events and retain only common events. + """ + common_events = self._aggregator.get_common_events() + self._aggregator.clear_events() + self._aggregator.add_events(common_events) + self._aggregator.reset_workers() + return self + + def increment_workers(self, count: int = 1) -> None: + self._aggregator.increment_workers(count) + + def get_all_events(self) -> list[KVCacheEvent]: + return self._aggregator.get_all_events() + + def get_number_of_workers(self) -> int: + return self._aggregator.get_number_of_workers() + + def clear_events(self) -> None: + self._aggregator.clear_events() + self._aggregator.reset_workers() + + def __repr__(self) -> str: + return f"" + + class LMCacheConnectorV1(KVConnectorBase_V1): def __init__( self, @@ -50,10 +93,17 @@ class LMCacheConnectorV1(KVConnectorBase_V1): cls = _adapter.LMCacheConnectorV1Impl else: logger.info("Initializing latest dev LMCache connector") + # lazy import + from lmcache.integration.vllm.vllm_v1_adapter import ( + LMCacheConnectorV1Impl as LMCacheConnectorLatestImpl, + ) + cls = LMCacheConnectorLatestImpl self._lmcache_engine = cls(vllm_config, role, self) + self._kv_cache_events: LMCacheKVEvents | None = None + # ============================== # Worker-side methods # ============================== @@ -151,6 +201,31 @@ class LMCacheConnectorV1(KVConnectorBase_V1): # Fallback for older versions that don't support this method return set() + def get_kv_connector_kv_cache_events(self) -> LMCacheKVEvents | None: + """ + Get the KV connector kv cache events collected during the last interval. 
+ """ + + events = self._lmcache_engine.get_kv_events() # type: ignore [attr-defined] + if not events: + return None + + blocks: list[BlockStored] = [ + BlockStored( + block_hashes=e.block_hashes, + parent_block_hash=e.parent_block_hash, + token_ids=e.token_ids, + lora_id=e.lora_id, + block_size=e.block_size, + medium=e.medium, + ) + for e in events + ] + + lmcache_kv_events = LMCacheKVEvents(num_workers=1) + lmcache_kv_events.add_events(blocks) + return lmcache_kv_events + # ============================== # Scheduler-side methods # ============================== @@ -198,6 +273,28 @@ class LMCacheConnectorV1(KVConnectorBase_V1): """ return self._lmcache_engine.build_connector_meta(scheduler_output) + def update_connector_output(self, connector_output: KVConnectorOutput): + """ + Update KVConnector state from worker-side connectors output. + + Args: + connector_output (KVConnectorOutput): the worker-side + connectors output. + """ + # Get the KV events + kv_cache_events = connector_output.kv_cache_events + if not kv_cache_events or not isinstance(kv_cache_events, LMCacheKVEvents): + return + + if self._kv_cache_events is None: + self._kv_cache_events = kv_cache_events + else: + self._kv_cache_events.add_events(kv_cache_events.get_all_events()) + self._kv_cache_events.increment_workers( + kv_cache_events.get_number_of_workers() + ) + return + def request_finished( self, request: "Request", @@ -214,3 +311,17 @@ class LMCacheConnectorV1(KVConnectorBase_V1): returned by the engine. """ return self._lmcache_engine.request_finished(request, block_ids) + + def take_events(self) -> Iterable["KVCacheEvent"]: + """ + Take the KV cache events from the connector. + + Yields: + New KV cache events since the last call. + """ + if self._kv_cache_events is not None: + self._kv_cache_events.aggregate() + kv_cache_events = self._kv_cache_events.get_all_events() + yield from kv_cache_events + self._kv_cache_events.clear_events() + self._kv_cache_events = None diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py index 15ac5b049fce9..cdc2969a7735e 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/vllm_v1_adapter.py @@ -27,7 +27,7 @@ from lmcache.v1.lookup_client.lmcache_async_lookup_client import ( LMCacheAsyncLookupServer, ) from lmcache.v1.offload_server.zmq_server import ZMQOffloadServer -from lmcache.v1.plugin.plugin_launcher import PluginLauncher +from lmcache.v1.plugin.runtime_plugin_launcher import RuntimePluginLauncher from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import VllmConfig @@ -683,7 +683,7 @@ class LMCacheConnectorV1Impl: self.api_server = InternalAPIServer(self) self.api_server.start() # Launch plugins - self.plugin_launcher = PluginLauncher( + self.plugin_launcher = RuntimePluginLauncher( self.config, role, self.worker_count, diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py index c80dc1a567fdb..6825745374959 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py @@ -259,6 +259,12 @@ class MultiConnector(KVConnectorBase_V1): agg_block_ids |= c.get_block_ids_with_load_errors() return agg_block_ids + # TODO: Add a generic implementation 
of 'get_kv_connector_kv_cache_events' method + # for the MultiConnector. It should be able to get events from multiple + # connectors, handling the case where only a subset of the requested connectors + # implements the 'get_kv_connector_kv_cache_events' + # Follow on PR from https://github.com/vllm-project/vllm/pull/28309#pullrequestreview-3566351082 + # ============================== # Scheduler-side methods # ============================== diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index f910f10407d44..338cb1f1814b5 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1586,6 +1586,8 @@ def destroy_distributed_environment(): def cleanup_dist_env_and_memory(shutdown_ray: bool = False): + # Reset environment variable cache + envs.disable_envs_cache() # Ensure all objects are not frozen before cleanup gc.unfreeze() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ceac5407af6e2..757023e12d439 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -50,6 +50,7 @@ from vllm.config import ( ObservabilityConfig, ParallelConfig, PoolerConfig, + ProfilerConfig, SchedulerConfig, SpeculativeConfig, StructuredOutputsConfig, @@ -70,7 +71,6 @@ from vllm.config.model import ( LogprobsMode, ModelDType, RunnerOption, - TaskOption, TokenizerMode, ) from vllm.config.multimodal import MMCacheType, MMEncoderTPMode @@ -359,7 +359,6 @@ class EngineArgs: hf_config_path: str | None = ModelConfig.hf_config_path runner: RunnerOption = ModelConfig.runner convert: ConvertOption = ModelConfig.convert - task: TaskOption | None = ModelConfig.task skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode @@ -372,9 +371,8 @@ class EngineArgs: config_format: str = ModelConfig.config_format dtype: ModelDType = ModelConfig.dtype kv_cache_dtype: CacheDType = CacheConfig.cache_dtype - seed: int | None = 0 + seed: int = ModelConfig.seed max_model_len: int | None = ModelConfig.max_model_len - cuda_graph_sizes: list[int] | None = CompilationConfig.cudagraph_capture_sizes cudagraph_capture_sizes: list[int] | None = ( CompilationConfig.cudagraph_capture_sizes ) @@ -462,7 +460,6 @@ class EngineArgs: MultiModalConfig, "media_io_kwargs" ) mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs - disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb mm_processor_cache_type: MMCacheType | None = ( MultiModalConfig.mm_processor_cache_type @@ -532,6 +529,8 @@ class EngineArgs: worker_cls: str = ParallelConfig.worker_cls worker_extension_cls: str = ParallelConfig.worker_extension_cls + profiler_config: ProfilerConfig = get_field(VllmConfig, "profiler_config") + kv_transfer_config: KVTransferConfig | None = None kv_events_config: KVEventsConfig | None = None @@ -556,9 +555,6 @@ class EngineArgs: use_tqdm_on_load: bool = LoadConfig.use_tqdm_on_load pt_load_map_location: str = LoadConfig.pt_load_map_location - # DEPRECATED - enable_multimodal_encoder_data_parallel: bool = False - logits_processors: list[str | type[LogitsProcessor]] | None = ( ModelConfig.logits_processors ) @@ -626,7 +622,6 @@ class EngineArgs: model_group.add_argument("--model", **model_kwargs["model"]) model_group.add_argument("--runner", **model_kwargs["runner"]) model_group.add_argument("--convert", **model_kwargs["convert"]) - 
model_group.add_argument("--task", **model_kwargs["task"], deprecated=True) model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"]) model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"]) model_group.add_argument( @@ -880,11 +875,6 @@ class EngineArgs: parallel_group.add_argument( "--worker-extension-cls", **parallel_kwargs["worker_extension_cls"] ) - parallel_group.add_argument( - "--enable-multimodal-encoder-data-parallel", - action="store_true", - deprecated=True, - ) # KV cache arguments cache_kwargs = get_kwargs(CacheConfig) @@ -958,9 +948,6 @@ class EngineArgs: multimodal_group.add_argument( "--mm-processor-cache-gb", **multimodal_kwargs["mm_processor_cache_gb"] ) - multimodal_group.add_argument( - "--disable-mm-preprocessor-cache", action="store_true", deprecated=True - ) multimodal_group.add_argument( "--mm-processor-cache-type", **multimodal_kwargs["mm_processor_cache_type"] ) @@ -1118,15 +1105,6 @@ class EngineArgs: compilation_group.add_argument( "--cudagraph-capture-sizes", **compilation_kwargs["cudagraph_capture_sizes"] ) - compilation_kwargs["cudagraph_capture_sizes"]["help"] = ( - "--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or v1.0.0," - " whichever is soonest. Please use --cudagraph-capture-sizes instead." - ) - compilation_group.add_argument( - "--cuda-graph-sizes", - **compilation_kwargs["cudagraph_capture_sizes"], - deprecated=True, - ) compilation_group.add_argument( "--max-cudagraph-capture-size", **compilation_kwargs["max_cudagraph_capture_size"], @@ -1164,7 +1142,7 @@ class EngineArgs: vllm_group.add_argument( "--structured-outputs-config", **vllm_kwargs["structured_outputs_config"] ) - + vllm_group.add_argument("--profiler-config", **vllm_kwargs["profiler_config"]) vllm_group.add_argument( "--optimization-level", **vllm_kwargs["optimization_level"] ) @@ -1199,62 +1177,20 @@ class EngineArgs: if is_gguf(self.model): self.quantization = self.load_format = "gguf" - # NOTE(woosuk): In V1, we use separate processes for workers (unless - # VLLM_ENABLE_V1_MULTIPROCESSING=0), so setting a seed here - # doesn't affect the user process. - if self.seed is None: - logger.warning_once( - "`seed=None` is equivalent to `seed=0` in V1 Engine. " - "You will no longer be allowed to pass `None` in v0.13.", - scope="local", + if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: + logger.warning( + "The global random seed is set to %d. Since " + "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may " + "affect the random state of the Python process that " + "launched vLLM.", + self.seed, ) - self.seed = 0 - if not envs.VLLM_ENABLE_V1_MULTIPROCESSING: - logger.warning( - "The global random seed is set to %d. Since " - "VLLM_ENABLE_V1_MULTIPROCESSING is set to False, this may " - "affect the random state of the Python process that " - "launched vLLM.", - self.seed, - ) - - if self.disable_mm_preprocessor_cache: - logger.warning_once( - "`--disable-mm-preprocessor-cache` is deprecated " - "and will be removed in v0.13. " - "Please use `--mm-processor-cache-gb 0` instead.", - scope="local", - ) - - self.mm_processor_cache_gb = 0 - elif envs.VLLM_MM_INPUT_CACHE_GIB != 4: - logger.warning_once( - "VLLM_MM_INPUT_CACHE_GIB` is deprecated " - "and will be removed in v0.13. 
" - "Please use `--mm-processor-cache-gb %d` instead.", - envs.VLLM_MM_INPUT_CACHE_GIB, - scope="local", - ) - - self.mm_processor_cache_gb = envs.VLLM_MM_INPUT_CACHE_GIB - - if self.enable_multimodal_encoder_data_parallel: - logger.warning_once( - "--enable-multimodal-encoder-data-parallel` is deprecated " - "and will be removed in v0.13. " - "Please use `--mm-encoder-tp-mode data` instead.", - scope="local", - ) - - self.mm_encoder_tp_mode = "data" - return ModelConfig( model=self.model, hf_config_path=self.hf_config_path, runner=self.runner, convert=self.convert, - task=self.task, tokenizer=self.tokenizer, tokenizer_mode=self.tokenizer_mode, trust_remote_code=self.trust_remote_code, @@ -1738,18 +1674,6 @@ class EngineArgs: # Compilation config overrides compilation_config = copy.deepcopy(self.compilation_config) - if self.cuda_graph_sizes is not None: - logger.warning( - "--cuda-graph-sizes is deprecated and will be removed in v0.13.0 or " - "v1.0.0, whichever is soonest. Please use --cudagraph-capture-sizes " - "instead." - ) - if compilation_config.cudagraph_capture_sizes is not None: - raise ValueError( - "cuda_graph_sizes and compilation_config." - "cudagraph_capture_sizes are mutually exclusive" - ) - compilation_config.cudagraph_capture_sizes = self.cuda_graph_sizes if self.cudagraph_capture_sizes is not None: if compilation_config.cudagraph_capture_sizes is not None: raise ValueError( @@ -1782,6 +1706,7 @@ class EngineArgs: kv_transfer_config=self.kv_transfer_config, kv_events_config=self.kv_events_config, ec_transfer_config=self.ec_transfer_config, + profiler_config=self.profiler_config, additional_config=self.additional_config, optimization_level=self.optimization_level, ) @@ -1858,6 +1783,7 @@ class EngineArgs: except Exception: # This is only used to set default_max_num_batched_tokens device_memory = 0 + device_name = "" # NOTE(Kuntai): Setting large `max_num_batched_tokens` for A100 reduces # throughput, see PR #17885 for more details. @@ -1922,16 +1848,6 @@ class EngineArgs: default_chunked_prefill = model_config.is_chunked_prefill_supported default_prefix_caching = model_config.is_prefix_caching_supported - if self.prefill_context_parallel_size > 1: - default_chunked_prefill = False - default_prefix_caching = False - logger.warning_once( - "--prefill-context-parallel-size > 1 is not compatible with " - "chunked prefill and prefix caching now. Chunked prefill " - "and prefix caching have been disabled by default.", - scope="local", - ) - if self.enable_chunked_prefill is None: self.enable_chunked_prefill = default_chunked_prefill @@ -2117,11 +2033,13 @@ def human_readable_int(value): "k": 10**3, "m": 10**6, "g": 10**9, + "t": 10**12, } binary_multiplier = { "K": 2**10, "M": 2**20, "G": 2**30, + "T": 2**40, } number, suffix = match.groups() diff --git a/vllm/entrypoints/constants.py b/vllm/entrypoints/constants.py index b5bcccc35d6c8..5726ee0735d4c 100644 --- a/vllm/entrypoints/constants.py +++ b/vllm/entrypoints/constants.py @@ -8,3 +8,5 @@ Shared constants for vLLM entrypoints. 
# These constants help mitigate header abuse attacks H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT = 4194304 # 4 MB H11_MAX_HEADER_COUNT_DEFAULT = 256 + +MCP_PREFIX = "mcp_" diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py index 01ddab473723b..c70eaaa082fe5 100644 --- a/vllm/entrypoints/context.py +++ b/vllm/entrypoints/context.py @@ -19,6 +19,7 @@ from vllm import envs from vllm.entrypoints.chat_utils import ( ChatTemplateContentFormatOption, ) +from vllm.entrypoints.constants import MCP_PREFIX from vllm.entrypoints.openai.parser.harmony_utils import ( get_encoding, get_streamable_parser_for_assistant, @@ -303,7 +304,7 @@ class ParsableContext(ConversationContext): result_str = result.content[0].text message = ResponseFunctionToolCallOutputItem( - id=f"fco_{random_uuid()}", + id=f"mcpo_{random_uuid()}", type="function_call_output", call_id=f"call_{random_uuid()}", output=result_str, @@ -385,6 +386,9 @@ class ParsableContext(ConversationContext): if not self.parser.response_messages: return [] last_msg = self.parser.response_messages[-1] + # change this to a mcp_ function call + last_msg.id = f"{MCP_PREFIX}{random_uuid()}" + self.parser.response_messages[-1] = last_msg if last_msg.name == "code_interpreter": return await self.call_python_tool(self._tool_sessions["python"], last_msg) elif last_msg.name == "web_search_preview": diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 913324fd5f9c3..6440b702f4fa6 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -9,7 +9,7 @@ import cloudpickle import torch.nn as nn from pydantic import ValidationError from tqdm.auto import tqdm -from typing_extensions import TypeVar, deprecated +from typing_extensions import TypeVar from vllm.beam_search import ( BeamSearchInstance, @@ -20,6 +20,7 @@ from vllm.beam_search import ( from vllm.config import ( CompilationConfig, PoolerConfig, + ProfilerConfig, StructuredOutputsConfig, is_init_field, ) @@ -72,7 +73,6 @@ from vllm.pooling_params import PoolingParams from vllm.sampling_params import BeamSearchParams, RequestOutputKind, SamplingParams from vllm.tasks import PoolingTask from vllm.tokenizers import MistralTokenizer, TokenizerLike -from vllm.tokenizers.hf import get_cached_tokenizer from vllm.usage.usage_lib import UsageContext from vllm.utils.collection_utils import as_iter, is_list_of from vllm.utils.counter import Counter @@ -198,7 +198,7 @@ class LLM: quantization: QuantizationMethods | None = None, revision: str | None = None, tokenizer_revision: str | None = None, - seed: int | None = None, + seed: int = 0, gpu_memory_utilization: float = 0.9, swap_space: float = 4, cpu_offload_gb: float = 0, @@ -211,6 +211,7 @@ class LLM: structured_outputs_config: dict[str, Any] | StructuredOutputsConfig | None = None, + profiler_config: dict[str, Any] | ProfilerConfig | None = None, kv_cache_memory_bytes: int | None = None, compilation_config: int | dict[str, Any] | CompilationConfig | None = None, logits_processors: list[str | type[LogitsProcessor]] | None = None, @@ -282,6 +283,20 @@ class LLM: else: structured_outputs_instance = StructuredOutputsConfig() + if profiler_config is not None: + if isinstance(profiler_config, dict): + profiler_config_instance = ProfilerConfig( + **{ + k: v + for k, v in profiler_config.items() + if is_init_field(ProfilerConfig, k) + } + ) + else: + profiler_config_instance = profiler_config + else: + profiler_config_instance = ProfilerConfig() + # warn about single-process data parallel usage. 
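Like structured_outputs_config just above, the new profiler_config argument to LLM accepts either a ProfilerConfig instance or a plain dict whose keys are filtered through is_init_field() before being expanded into ProfilerConfig(**...). A minimal sketch of the two equivalent call styles (no ProfilerConfig fields are shown in this hunk, so only the default construction is assumed):

    from vllm import LLM
    from vllm.config import ProfilerConfig

    # config-object form
    llm = LLM(model="facebook/opt-125m", profiler_config=ProfilerConfig())
    # dict form; an empty dict is converted to ProfilerConfig() internally
    llm = LLM(model="facebook/opt-125m", profiler_config={})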
_dp_size = int(kwargs.get("data_parallel_size", 1)) _distributed_executor_backend = kwargs.get("distributed_executor_backend") @@ -324,6 +339,7 @@ class LLM: mm_processor_kwargs=mm_processor_kwargs, pooler_config=pooler_config, structured_outputs_config=structured_outputs_instance, + profiler_config=profiler_config_instance, compilation_config=compilation_config_instance, logits_processors=logits_processors, **kwargs, @@ -350,16 +366,6 @@ class LLM: def get_tokenizer(self) -> TokenizerLike: return self.llm_engine.get_tokenizer() - @deprecated("`set_tokenizer` is deprecated and will be removed in v0.13.") - def set_tokenizer(self, tokenizer: TokenizerLike) -> None: - # While CachedTokenizer is dynamic, have no choice but - # compare class name. Misjudgment will arise from - # user-defined tokenizer started with 'Cached' - if tokenizer.__class__.__name__.startswith("Cached"): - self.llm_engine.tokenizer = tokenizer - else: - self.llm_engine.tokenizer = get_cached_tokenizer(tokenizer) - def reset_mm_cache(self) -> None: self.input_processor.clear_mm_cache() self.llm_engine.reset_mm_cache() diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 946362ce2ef0a..b798b05dcfcbf 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -176,7 +176,7 @@ class FrontendArgs: enable_force_include_usage: bool = False """If set to True, including usage on every request.""" enable_tokenizer_info_endpoint: bool = False - """Enable the /get_tokenizer_info endpoint. May expose chat + """Enable the `/tokenizer_info` endpoint. May expose chat templates and other tokenizer configuration.""" enable_log_outputs: bool = False """If True, log model outputs (generations). diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index c6333d170c663..2560a5b2cdf41 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -51,7 +51,11 @@ from vllm.entrypoints.openai.protocol import ( ToolCall, UsageInfo, ) -from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs +from vllm.entrypoints.openai.serving_engine import ( + GenerationError, + OpenAIServing, + clamp_prompt_logprobs, +) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.openai.tool_parsers import ToolParser from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import MistralToolCall @@ -380,6 +384,8 @@ class OpenAIServingChat(OpenAIServing): tokenizer, request_metadata, ) + except GenerationError as e: + return self._convert_generation_error_to_response(e) except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -1120,6 +1126,10 @@ class OpenAIServingChat(OpenAIServing): # if the model is finished generating else: + # check for error finish reason and abort streaming + # finish_reason='error' indicates a retryable error + self._raise_if_error(output.finish_reason, request_id) + # check to make sure we haven't "forgotten" to stream # any tokens that were generated but previously # matched by partial json parsing @@ -1287,6 +1297,8 @@ class OpenAIServingChat(OpenAIServing): delta=False, ) + except GenerationError as e: + yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n" except Exception as e: # TODO: Use a vllm-specific Validation Error logger.exception("Error in chat completion stream generator.") @@ -1327,6 +1339,9 @@ class 
OpenAIServingChat(OpenAIServing): role = self.get_chat_request_role(request) for output in final_res.outputs: + # check for error finish reason and raise GenerationError + # finish_reason='error' indicates a retryable request-level internal error + self._raise_if_error(output.finish_reason, request_id) token_ids = output.token_ids out_logprobs = output.logprobs tool_call_info = None diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 3e421e21e3e80..1be0afc8c74e5 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -24,7 +24,11 @@ from vllm.entrypoints.openai.protocol import ( RequestResponseMetadata, UsageInfo, ) -from vllm.entrypoints.openai.serving_engine import OpenAIServing, clamp_prompt_logprobs +from vllm.entrypoints.openai.serving_engine import ( + GenerationError, + OpenAIServing, + clamp_prompt_logprobs, +) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.renderer import RenderConfig from vllm.entrypoints.utils import get_max_tokens, should_include_usage @@ -300,6 +304,8 @@ class OpenAIServingCompletion(OpenAIServing): ) except asyncio.CancelledError: return self.create_error_response("Client disconnected") + except GenerationError as e: + return self._convert_generation_error_to_response(e) except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) @@ -437,6 +443,8 @@ class OpenAIServingCompletion(OpenAIServing): finish_reason = output.finish_reason stop_reason = output.stop_reason + self._raise_if_error(finish_reason, request_id) + chunk = CompletionStreamResponse( id=request_id, created=created_time, @@ -498,8 +506,11 @@ class OpenAIServingCompletion(OpenAIServing): # report to FastAPI middleware aggregate usage across all choices request_metadata.final_usage_info = final_usage_info + except GenerationError as e: + yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n" except Exception as e: # TODO: Use a vllm-specific Validation Error + logger.exception("Error in completion stream generator.") data = self.create_streaming_error_response(str(e)) yield f"data: {data}\n\n" yield "data: [DONE]\n\n" @@ -530,6 +541,8 @@ class OpenAIServingCompletion(OpenAIServing): out_logprobs: GenericSequence[dict[int, Logprob] | None] | None for output in final_res.outputs: + self._raise_if_error(output.finish_reason, request_id) + assert request.max_tokens is not None if request.echo: if request.return_token_ids: diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 99936f588f28b..a799432baeb40 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -133,6 +133,15 @@ from vllm.utils.async_utils import ( from vllm.utils.collection_utils import is_list_of from vllm.v1.engine import EngineCoreRequest + +class GenerationError(Exception): + """raised when finish_reason indicates internal server error (500)""" + + def __init__(self, message: str = "Internal server error"): + super().__init__(message) + self.status_code = HTTPStatus.INTERNAL_SERVER_ERROR + + logger = init_logger(__name__) CompletionLikeRequest: TypeAlias = ( @@ -456,6 +465,29 @@ class OpenAIServing: # Iterate through all beam inference results for i, result in enumerate(output): current_beam = all_beams[i] + + # check for error finish reason and abort beam search + if result.outputs[0].finish_reason 
== "error": + # yield error output and terminate beam search + yield RequestOutput( + request_id=request_id, + prompt=prompt_text, + outputs=[ + CompletionOutput( + index=0, + text="", + token_ids=[], + cumulative_logprob=None, + logprobs=None, + finish_reason="error", + ) + ], + finished=True, + prompt_token_ids=prompt_token_ids, + prompt_logprobs=None, + ) + return + if result.outputs[0].logprobs is not None: logprobs = result.outputs[0].logprobs[0] all_beams_token_id.extend(list(logprobs.keys())) @@ -780,6 +812,35 @@ class OpenAIServing: ) return json_str + def _raise_if_error(self, finish_reason: str | None, request_id: str) -> None: + """Raise GenerationError if finish_reason indicates an error.""" + if finish_reason == "error": + logger.error( + "Request %s failed with an internal error during generation", + request_id, + ) + raise GenerationError("Internal server error") + + def _convert_generation_error_to_response( + self, e: GenerationError + ) -> ErrorResponse: + """Convert GenerationError to ErrorResponse.""" + return self.create_error_response( + str(e), + err_type="InternalServerError", + status_code=e.status_code, + ) + + def _convert_generation_error_to_streaming_response( + self, e: GenerationError + ) -> str: + """Convert GenerationError to streaming error response.""" + return self.create_streaming_error_response( + str(e), + err_type="InternalServerError", + status_code=e.status_code, + ) + async def _check_model( self, request: AnyRequest, @@ -1339,6 +1400,7 @@ class OpenAIServing: ) engine_prompt = engine_prompts[0] request_prompt = request_prompts[0] + prompt_text, _, _ = self._get_prompt_components(request_prompt) # Update the sampling params. sampling_params.max_tokens = self.max_model_len - len( diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 91616a78e11dc..60d14337dcaaf 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -50,6 +50,7 @@ from openai.types.responses.response_reasoning_item import ( ) from openai.types.responses.tool import Mcp, Tool from openai_harmony import Message as OpenAIHarmonyMessage +from pydantic import TypeAdapter from vllm import envs from vllm.engine.protocol import EngineClient @@ -94,7 +95,10 @@ from vllm.entrypoints.openai.protocol import ( ResponseUsage, StreamingResponsesResponse, ) -from vllm.entrypoints.openai.serving_engine import OpenAIServing +from vllm.entrypoints.openai.serving_engine import ( + GenerationError, + OpenAIServing, +) from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.entrypoints.responses_utils import ( construct_input_messages, @@ -541,6 +545,8 @@ class OpenAIServingResponses(OpenAIServing): tokenizer, request_metadata, ) + except GenerationError as e: + return self._convert_generation_error_to_response(e) except Exception as e: return self.create_error_response(str(e)) @@ -648,6 +654,8 @@ class OpenAIServingResponses(OpenAIServing): status = "incomplete" elif context.finish_reason == "abort": status = "cancelled" + else: + self._raise_if_error(context.finish_reason, request.request_id) else: status = "incomplete" elif isinstance(context, ParsableContext): @@ -673,6 +681,9 @@ class OpenAIServingResponses(OpenAIServing): assert len(final_res.outputs) == 1 final_output = final_res.outputs[0] + # finish_reason='error' indicates retryable internal error + self._raise_if_error(final_output.finish_reason, request.request_id) + output = 
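The serving-side changes all follow one pattern: _raise_if_error() maps finish_reason == "error" to a GenerationError, which non-streaming handlers turn into a 500 ErrorResponse via _convert_generation_error_to_response() and streaming generators turn into an error chunk via _convert_generation_error_to_streaming_response(). Reduced to a sketch (surrounding request handling elided; this is not the verbatim serving code):

    try:
        self._raise_if_error(output.finish_reason, request_id)
        # ... build the normal response ...
    except GenerationError as e:
        return self._convert_generation_error_to_response(e)
        # or, inside a streaming generator:
        # yield f"data: {self._convert_generation_error_to_streaming_response(e)}\n\n"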
self._make_response_output_items(request, final_output, tokenizer) if request.enable_response_messages: @@ -1066,6 +1077,8 @@ class OpenAIServingResponses(OpenAIServing): async for event in generator: event_deque.append(event) new_event_signal.set() # Signal new event available + except GenerationError as e: + response = self._convert_generation_error_to_response(e) except Exception as e: logger.exception("Background request failed for %s", request.request_id) response = self.create_error_response(str(e)) @@ -1089,6 +1102,8 @@ class OpenAIServingResponses(OpenAIServing): ): try: response = await self.responses_full_generator(request, *args, **kwargs) + except GenerationError as e: + response = self._convert_generation_error_to_response(e) except Exception as e: logger.exception("Background request failed for %s", request.request_id) response = self.create_error_response(str(e)) @@ -1227,6 +1242,8 @@ class OpenAIServingResponses(OpenAIServing): continue if ctx.last_output.outputs: output = ctx.last_output.outputs[0] + # finish_reason='error' indicates a retryable error + self._raise_if_error(output.finish_reason, request.request_id) if reasoning_parser: delta_message = reasoning_parser.extract_reasoning_streaming( previous_text=previous_text, @@ -1522,6 +1539,9 @@ class OpenAIServingResponses(OpenAIServing): async for ctx in result_generator: assert isinstance(ctx, StreamingHarmonyContext) + # finish_reason='error' indicates a retryable error + self._raise_if_error(ctx.finish_reason, request.request_id) + if ctx.is_expecting_start(): current_output_index += 1 sent_output_item_added = False @@ -2016,18 +2036,25 @@ class OpenAIServingResponses(OpenAIServing): ) ) - async for event_data in processer( - request, - sampling_params, - result_generator, - context, - model_name, - tokenizer, - request_metadata, - created_time, - _increment_sequence_number_and_return, - ): - yield event_data + try: + async for event_data in processer( + request, + sampling_params, + result_generator, + context, + model_name, + tokenizer, + request_metadata, + created_time, + _increment_sequence_number_and_return, + ): + yield event_data + except GenerationError as e: + error_json = self._convert_generation_error_to_streaming_response(e) + yield _increment_sequence_number_and_return( + TypeAdapter(StreamingResponsesResponse).validate_json(error_json) + ) + return async def empty_async_generator(): # A hack to trick Python to think this is a generator but diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index aa5089ffe84d7..bc827f045606c 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -99,12 +99,7 @@ class MistralToolParser(ToolParser): self.bot_token = "[TOOL_CALLS]" self.bot_token_id = self.vocab.get(self.bot_token) self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL) - if not _is_pre_v11_tokeniser(self.model_tokenizer): - self.fn_name_regex = re.compile( - r"([a-zA-Z0-9_-]+)(\{[\s\S]*?\}+)", re.DOTALL - ) - else: - self.fn_name_regex = None + self._is_pre_v11 = _is_pre_v11_tokeniser(self.model_tokenizer) if self.bot_token_id is None: raise RuntimeError( @@ -148,23 +143,24 @@ class MistralToolParser(ToolParser): tool_content = model_output.replace(self.bot_token, "").strip() try: - # we first try to directly load the json as parsing very nested - # jsons is difficult try: - if self.fn_name_regex: + if not self._is_pre_v11: 
function_call_arr = [] for single_tool_content in model_output.split(self.bot_token): - matches = self.fn_name_regex.findall(single_tool_content) + if "{" not in single_tool_content: + continue - for match in matches: - fn_name = match[0] - args = match[1] + end_name = single_tool_content.find("{") + fn_name, args = ( + single_tool_content[:end_name], + single_tool_content[end_name:], + ) - # fn_name is encoded outside serialized json dump - # only arguments are serialized - function_call_arr.append( - {"name": fn_name, "arguments": json.loads(args)} - ) + # fn_name is encoded outside serialized json dump + # only arguments are serialized + function_call_arr.append( + {"name": fn_name, "arguments": json.loads(args)} + ) else: function_call_arr = json.loads(tool_content) except json.JSONDecodeError: diff --git a/vllm/entrypoints/responses_utils.py b/vllm/entrypoints/responses_utils.py index fbc137bac4543..99080fa43cb8e 100644 --- a/vllm/entrypoints/responses_utils.py +++ b/vllm/entrypoints/responses_utils.py @@ -22,6 +22,7 @@ from openai.types.responses.response_reasoning_item import ResponseReasoningItem from openai.types.responses.tool import Tool from vllm import envs +from vllm.entrypoints.constants import MCP_PREFIX from vllm.entrypoints.openai.protocol import ( ChatCompletionMessageParam, ResponseInputOutputItem, @@ -44,13 +45,13 @@ def make_response_output_items_from_parsable_context( ) if isinstance(output_messages[-1], ResponseFunctionToolCall): mcp_message = McpCall( - id=f"mcp_{random_uuid()}", + id=f"{MCP_PREFIX}{random_uuid()}", arguments=output_messages[-1].arguments, name=output_messages[-1].name, server_label=output_messages[ -1 ].name, # TODO: store the server label - type="mcp_call", + type=f"{MCP_PREFIX}call", status="completed", output=message.output, # TODO: support error output @@ -98,12 +99,63 @@ def construct_input_messages( if isinstance(request_input, str): messages.append({"role": "user", "content": request_input}) else: - for item in request_input: - messages.append(construct_chat_message_with_tool_call(item)) + input_messages = construct_chat_messages_with_tool_call(request_input) + messages.extend(input_messages) return messages -def construct_chat_message_with_tool_call( +def _maybe_combine_reasoning_and_tool_call( + item: ResponseInputOutputItem, messages: list[ChatCompletionMessageParam] +) -> ChatCompletionMessageParam | None: + """Many models treat MCP calls and reasoning as a single message. 
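For v11+ tokenisers the fn_name_regex is gone: the parser splits on the [TOOL_CALLS] token and treats everything before the first "{" as the function name and the rest as the serialized arguments. Worked through on a made-up payload:

    import json

    single_tool_content = 'get_weather{"city": "Paris"}'  # hypothetical text following [TOOL_CALLS]
    end_name = single_tool_content.find("{")
    fn_name, args = single_tool_content[:end_name], single_tool_content[end_name:]
    assert fn_name == "get_weather"
    assert json.loads(args) == {"city": "Paris"}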
+ This function checks if the last message is a reasoning message and + the current message is a tool call""" + if not ( + isinstance(item, ResponseFunctionToolCall) and item.id.startswith(MCP_PREFIX) + ): + return None + if len(messages) == 0: + return None + last_message = messages[-1] + if not ( + last_message.get("role") == "assistant" + and last_message.get("reasoning") is not None + ): + return None + + last_message["tool_calls"] = [ + ChatCompletionMessageToolCallParam( + id=item.call_id, + function=FunctionCallTool( + name=item.name, + arguments=item.arguments, + ), + type="function", + ) + ] + return last_message + + +def construct_chat_messages_with_tool_call( + input_messages: list[ResponseInputOutputItem], +) -> list[ChatCompletionMessageParam]: + """This function wraps _construct_single_message_from_response_item + Because some chatMessages come from multiple response items + for example a reasoning item and a MCP tool call are two response items + but are one chat message + """ + messages: list[ChatCompletionMessageParam] = [] + for item in input_messages: + maybe_combined_message = _maybe_combine_reasoning_and_tool_call(item, messages) + if maybe_combined_message is not None: + messages[-1] = maybe_combined_message + else: + messages.append(_construct_single_message_from_response_item(item)) + + return messages + + +def _construct_single_message_from_response_item( item: ResponseInputOutputItem, ) -> ChatCompletionMessageParam: if isinstance(item, ResponseFunctionToolCall): diff --git a/vllm/entrypoints/serve/profile/api_router.py b/vllm/entrypoints/serve/profile/api_router.py index 166f13764eb36..eeed6b45ef4e9 100644 --- a/vllm/entrypoints/serve/profile/api_router.py +++ b/vllm/entrypoints/serve/profile/api_router.py @@ -5,7 +5,7 @@ from fastapi import APIRouter, FastAPI, Request from fastapi.responses import Response -import vllm.envs as envs +from vllm.config import ProfilerConfig from vllm.engine.protocol import EngineClient from vllm.logger import init_logger @@ -35,15 +35,12 @@ async def stop_profile(raw_request: Request): def attach_router(app: FastAPI): - if envs.VLLM_TORCH_PROFILER_DIR: + profiler_config = getattr(app.state.args, "profiler_config", None) + assert profiler_config is None or isinstance(profiler_config, ProfilerConfig) + if profiler_config is not None and profiler_config.profiler is not None: logger.warning_once( - "Torch Profiler is enabled in the API server. This should ONLY be " - "used for local development!" + "Profiler with mode '%s' is enabled in the " + "API server. This should ONLY be used for local development!", + profiler_config.profiler, ) - elif envs.VLLM_TORCH_CUDA_PROFILE: - logger.warning_once( - "CUDA Profiler is enabled in the API server. This should ONLY be " - "used for local development!" 
- ) - if envs.VLLM_TORCH_PROFILER_DIR or envs.VLLM_TORCH_CUDA_PROFILE: app.include_router(router) diff --git a/vllm/envs.py b/vllm/envs.py index 37711dece9abc..cb75ba1a62de9 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -72,9 +72,9 @@ if TYPE_CHECKING: VLLM_MAX_AUDIO_CLIP_FILESIZE_MB: int = 25 VLLM_VIDEO_LOADER_BACKEND: str = "opencv" VLLM_MEDIA_CONNECTOR: str = "http" - VLLM_MM_INPUT_CACHE_GIB: int = 4 VLLM_TARGET_DEVICE: str = "cuda" VLLM_MAIN_CUDA_VERSION: str = "12.9" + VLLM_FLOAT32_MATMUL_PRECISION: Literal["ieee", "tf32"] = "ieee" MAX_JOBS: str | None = None NVCC_THREADS: str | None = None VLLM_USE_PRECOMPILED: bool = False @@ -88,20 +88,23 @@ if TYPE_CHECKING: VLLM_HTTP_TIMEOUT_KEEP_ALIVE: int = 5 # seconds VLLM_PLUGINS: list[str] | None = None VLLM_LORA_RESOLVER_CACHE_DIR: str | None = None - VLLM_TORCH_CUDA_PROFILE: bool = False + # Deprecated env variables for profiling, kept for backward compatibility + # See also vllm/config/profiler.py and `--profiler-config` argument + VLLM_TORCH_CUDA_PROFILE: str | None = None VLLM_TORCH_PROFILER_DIR: str | None = None - VLLM_TORCH_PROFILER_RECORD_SHAPES: bool = False - VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: bool = False - VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: bool = False + VLLM_TORCH_PROFILER_RECORD_SHAPES: str | None = None + VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY: str | None = None + VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM: str | None = None + VLLM_TORCH_PROFILER_WITH_STACK: str | None = None + VLLM_TORCH_PROFILER_WITH_FLOPS: str | None = None + VLLM_TORCH_PROFILER_USE_GZIP: str | None = None + VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: str | None = None + VLLM_PROFILER_DELAY_ITERS: str | None = None + VLLM_PROFILER_MAX_ITERS: str | None = None + # End of deprecated env variables for profiling VLLM_USE_AOT_COMPILE: bool = False VLLM_USE_BYTECODE_HOOK: bool = False VLLM_FORCE_AOT_LOAD: bool = False - VLLM_TORCH_PROFILER_WITH_STACK: bool = True - VLLM_TORCH_PROFILER_WITH_FLOPS: bool = False - VLLM_PROFILER_DELAY_ITERS: int = 0 - VLLM_PROFILER_MAX_ITERS: int = 0 - VLLM_TORCH_PROFILER_USE_GZIP: bool = True - VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: bool = True VLLM_USE_TRITON_AWQ: bool = False VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_SKIP_P2P_CHECK: bool = False @@ -144,6 +147,7 @@ if TYPE_CHECKING: VLLM_DP_MASTER_IP: str = "" VLLM_DP_MASTER_PORT: int = 0 VLLM_MOE_DP_CHUNK_SIZE: int = 256 + VLLM_ENABLE_MOE_DP_CHUNK: bool = True VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False VLLM_RAY_DP_PACK_STRATEGY: Literal["strict", "fill", "span"] = "strict" VLLM_MARLIN_USE_ATOMIC_ADD: bool = False @@ -451,6 +455,16 @@ environment_variables: dict[str, Callable[[], Any]] = { # Main CUDA version of vLLM. This follows PyTorch but can be overridden. "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() or "12.9", + # Controls PyTorch float32 matmul precision mode within vLLM workers. + # Accepted values: + # - "ieee" (default): force full IEEE FP32 matmul precision. + # - "tf32": enable TensorFloat32-based fast matmul. + "VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices( + "VLLM_FLOAT32_MATMUL_PRECISION", + "ieee", + ["ieee", "tf32"], + case_sensitive=False, + ), # Maximum number of compilation jobs to run in parallel. # By default this is the number of CPUs "MAX_JOBS": lambda: os.getenv("MAX_JOBS", None), @@ -773,9 +787,6 @@ environment_variables: dict[str, Callable[[], Any]] = { # imported at runtime. # If a non-existing backend is used, an AssertionError will be thrown. 
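VLLM_FLOAT32_MATMUL_PRECISION goes through the usual envs machinery (env_with_choices), so an invalid value should be rejected when the variable is read; how workers apply the chosen mode to PyTorch is outside this hunk. Lookup sketch:

    import os

    os.environ["VLLM_FLOAT32_MATMUL_PRECISION"] = "tf32"  # default is "ieee"

    import vllm.envs as envs

    print(envs.VLLM_FLOAT32_MATMUL_PRECISION)  # -> "tf32"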
"VLLM_MEDIA_CONNECTOR": lambda: os.getenv("VLLM_MEDIA_CONNECTOR", "http"), - # [DEPRECATED] Cache size (in GiB per process) for multimodal input cache - # Default is 4 GiB per API process + 4 GiB per engine core process - "VLLM_MM_INPUT_CACHE_GIB": lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_GIB", "4")), # Path to the XLA persistent cache directory. # Only used for XLA devices such as TPUs. "VLLM_XLA_CACHE_PATH": lambda: os.path.expanduser( @@ -840,71 +851,52 @@ environment_variables: dict[str, Callable[[], Any]] = { "VLLM_LORA_RESOLVER_CACHE_DIR": lambda: os.getenv( "VLLM_LORA_RESOLVER_CACHE_DIR", None ), - # Enables torch CUDA profiling if set. - # On NVIDIA GPUs, this will start/stop cudaProfilerApi when triggered. - "VLLM_TORCH_CUDA_PROFILE": lambda: bool( - os.getenv("VLLM_TORCH_CUDA_PROFILE", "0") != "0" - ), + # Enables torch CUDA profiling if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_CUDA_PROFILE": lambda: os.getenv("VLLM_TORCH_CUDA_PROFILE"), # Enables torch profiler if set. - # Both AsyncLLM's CPU traces as well as workers' - # traces (CPU & GPU) will be saved under this directory. - # Note that it must be an absolute path. - "VLLM_TORCH_PROFILER_DIR": lambda: ( - None - if (val := os.getenv("VLLM_TORCH_PROFILER_DIR")) is None - else ( - val - if val.startswith("gs://") and val[5:] and val[5] != "/" - else os.path.abspath(os.path.expanduser(val)) - ) + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_DIR": lambda: os.getenv("VLLM_TORCH_PROFILER_DIR"), + # Enable torch profiler to record shapes if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES") ), - # Enable torch profiler to record shapes if set - # VLLM_TORCH_PROFILER_RECORD_SHAPES=1. If not set, torch profiler will - # not record shapes. - "VLLM_TORCH_PROFILER_RECORD_SHAPES": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_RECORD_SHAPES", "0") != "0" + # Enable torch profiler to profile memory if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY") ), - # Enable torch profiler to profile memory if set - # VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY=1. If not set, torch profiler - # will not profile memory. - "VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY", "0") != "0" + # Enable torch profiler to profile stack if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_WITH_STACK": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_WITH_STACK") ), - # Enable torch profiler to profile stack if set - # VLLM_TORCH_PROFILER_WITH_STACK=1. If not set, torch profiler WILL - # profile stack by default. - "VLLM_TORCH_PROFILER_WITH_STACK": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_WITH_STACK", "1") != "0" + # Enable torch profiler to profile flops if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS") ), - # Enable torch profiler to profile flops if set - # VLLM_TORCH_PROFILER_WITH_FLOPS=1. If not set, torch profiler will - # not profile flops. - "VLLM_TORCH_PROFILER_WITH_FLOPS": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_WITH_FLOPS", "0") != "0" - ), - # Disable torch profiling of the AsyncLLMEngine process. - # If set to 1, will not profile the engine process. 
- "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM", "0") != "0" + # Disable torch profiling of the AsyncLLMEngine process if set to 1. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM") ), # Delay number of iterations before starting profiling when using # the torch/torch CUDA profiler. If set to 0, will start profiling immediately. - "VLLM_PROFILER_DELAY_ITERS": lambda: int( - os.getenv("VLLM_PROFILER_DELAY_ITERS", "0") - ), + # Deprecated, see profiler_config. + "VLLM_PROFILER_DELAY_ITERS": lambda: (os.getenv("VLLM_PROFILER_DELAY_ITERS")), # Maximum number of iterations to profile when using the torch/torch CUDA profiler. # If set to 0, will not limit the number of iterations. - "VLLM_PROFILER_MAX_ITERS": lambda: int(os.getenv("VLLM_PROFILER_MAX_ITERS", "0")), + "VLLM_PROFILER_MAX_ITERS": lambda: os.getenv("VLLM_PROFILER_MAX_ITERS"), # Control whether torch profiler gzip-compresses profiling files. - # Set VLLM_TORCH_PROFILER_USE_GZIP=0 to disable gzip (enabled by default). - "VLLM_TORCH_PROFILER_USE_GZIP": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_USE_GZIP", "1") != "0" - ), + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_USE_GZIP": lambda: os.getenv("VLLM_TORCH_PROFILER_USE_GZIP"), # Control whether torch profiler dumps the self_cuda_time_total table. - # Set VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL=0 to disable dumping - # (enabled by default). - "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL": lambda: bool( - os.getenv("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL", "1") != "0" + # Set to 0 to disable dumping the table. + # Deprecated, see profiler_config. + "VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL": lambda: ( + os.getenv("VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL") ), # If set, vLLM will use Triton implementations of AWQ. "VLLM_USE_TRITON_AWQ": lambda: bool(int(os.getenv("VLLM_USE_TRITON_AWQ", "0"))), @@ -1101,6 +1093,9 @@ environment_variables: dict[str, Callable[[], Any]] = { # rank. All DP ranks process the activations in VLLM_MOE_DP_CHUNK_SIZE # units. "VLLM_MOE_DP_CHUNK_SIZE": lambda: int(os.getenv("VLLM_MOE_DP_CHUNK_SIZE", "256")), + "VLLM_ENABLE_MOE_DP_CHUNK": lambda: bool( + int(os.getenv("VLLM_ENABLE_MOE_DP_CHUNK", "1")) + ), # Randomize inputs during dummy runs when using Data Parallel "VLLM_RANDOMIZE_DP_DUMMY_INPUTS": lambda: os.environ.get( "VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0" @@ -1583,6 +1578,12 @@ def __getattr__(name: str): raise AttributeError(f"module {__name__!r} has no attribute {name!r}") +def _is_envs_cache_enabled() -> bool: + """Checked if __getattr__ is wrapped with functools.cache""" + global __getattr__ + return hasattr(__getattr__, "cache_clear") + + def enable_envs_cache() -> None: """ Enables caching of environment variables. This is useful for performance @@ -1593,6 +1594,9 @@ def enable_envs_cache() -> None: runtime overhead. This also means that environment variables should NOT be updated after the service is initialized. """ + if _is_envs_cache_enabled(): + # Avoid wrapping functools.cache multiple times + return # Tag __getattr__ with functools.cache global __getattr__ __getattr__ = functools.cache(__getattr__) @@ -1602,6 +1606,17 @@ def enable_envs_cache() -> None: __getattr__(key) +def disable_envs_cache() -> None: + """ + Resets the environment variables cache. It could be used to isolate environments + between unit tests. 
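disable_envs_cache() is the inverse of enable_envs_cache(): _is_envs_cache_enabled() detects the functools.cache wrapper via cache_clear, and the wrapper is removed again through __wrapped__. cleanup_dist_env_and_memory() (earlier in this diff) calls it so that tests do not see each other's cached environment. Usage sketch:

    import vllm.envs as envs

    envs.enable_envs_cache()   # lookups are cached; later os.environ edits are ignored
    envs.disable_envs_cache()  # cache unwrapped; lookups read os.environ again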
+ """ + global __getattr__ + # If __getattr__ is wrapped by functions.cache, unwrap the caching layer. + if _is_envs_cache_enabled(): + __getattr__ = __getattr__.__wrapped__ + + def __dir__(): return list(environment_variables.keys()) @@ -1664,7 +1679,6 @@ def compile_factors() -> dict[str, object]: "VLLM_MEDIA_CONNECTOR", "VLLM_ASSETS_CACHE", "VLLM_ASSETS_CACHE_MODEL_CLEAN", - "VLLM_MM_INPUT_CACHE_GIB", "VLLM_WORKER_MULTIPROC_METHOD", "VLLM_ENABLE_V1_MULTIPROCESSING", "VLLM_V1_OUTPUT_PROC_CHUNK_SIZE", diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 173d366267e87..033cc1f544b3b 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -292,7 +292,7 @@ def set_forward_context( if num_tokens_across_dp is None: assert ubatch_slices is None assert num_tokens is not None - _, num_tokens_across_dp = coordinate_batch_across_dp( + _, num_tokens_across_dp, _ = coordinate_batch_across_dp( num_tokens_unpadded=num_tokens, parallel_config=vllm_config.parallel_config, allow_microbatching=False, diff --git a/vllm/logger.py b/vllm/logger.py index 3b7bb1f22ec96..5506e09b8a65b 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -229,6 +229,11 @@ def suppress_logging(level: int = logging.INFO) -> Generator[None, Any, None]: # guaranteed by the Python GIL. _configure_vllm_root_logger() +# Transformers uses httpx to access the Hugging Face Hub. httpx is quite verbose, +# so we set its logging level to WARNING when vLLM's logging level is INFO. +if envs.VLLM_LOGGING_LEVEL == "INFO": + logging.getLogger("httpx").setLevel(logging.WARNING) + logger = init_logger(__name__) diff --git a/vllm/lora/lora_model.py b/vllm/lora/lora_model.py index db170f13ae1c7..f5e36697ed18c 100644 --- a/vllm/lora/lora_model.py +++ b/vllm/lora/lora_model.py @@ -3,7 +3,7 @@ import os -import safetensors.torch +import safetensors import torch from vllm.logger import init_logger diff --git a/vllm/model_executor/layers/batch_invariant.py b/vllm/model_executor/layers/batch_invariant.py index 4154122636dcf..b14e7dad77f9a 100644 --- a/vllm/model_executor/layers/batch_invariant.py +++ b/vllm/model_executor/layers/batch_invariant.py @@ -935,7 +935,11 @@ def enable_batch_invariant_mode(): # Batch invariant matmuls are no longer needed after cublas overrides if not is_torch_equal_or_newer("2.10.0.dev"): - if current_platform.is_device_capability(100): + if ( + current_platform.is_device_capability(100) + or current_platform.is_device_capability(80) + or current_platform.is_device_capability(89) + ): # For PyTorch 2.9, B200 uses GEMV for bs=1 # Requires https://github.com/pytorch/pytorch/pull/166735 _batch_invariant_LIB.impl("aten::mm", mm_batch_invariant, "CUDA") @@ -1006,11 +1010,11 @@ def override_envs_for_invariance(): "FLASH_ATTN", # best supported backend "FLASHINFER", "FLASH_ATTN_MLA", + "TRITON_MLA", # Not yet supported MLA backends # "FLASHMLA", # "FLEX_ATTENTION", # IMA issue even if we disable batch invariance # "FLASHINFER_MLA", https://github.com/vllm-project/vllm/pull/28967 - # "TRITON_MLA", ] if curr_attn_backend not in supported_backends: error = ( diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 9103e84aa7057..d71cfc5ad8200 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -4,7 +4,10 @@ from contextlib import contextmanager from typing import Any -from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig +from 
vllm.model_executor.layers.fused_moe.config import ( + FusedMoEConfig, + RoutingMethodType, +) from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, ) @@ -49,6 +52,7 @@ __all__ = [ "FusedMoEPermuteExpertsUnpermute", "FusedMoEActivationFormat", "FusedMoEPrepareAndFinalize", + "RoutingMethodType", "SharedFusedMoE", "activation_without_mul", "override_config", @@ -63,8 +67,10 @@ if HAS_TRITON: from vllm.model_executor.layers.fused_moe.cutlass_moe import ( CutlassBatchedExpertsFp8, CutlassExpertsFp8, + CutlassExpertsW4A8Fp8, cutlass_moe_fp4, cutlass_moe_fp8, + cutlass_moe_w4a8_fp8, ) from vllm.model_executor.layers.fused_moe.deep_gemm_moe import DeepGemmExperts from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( @@ -88,8 +94,10 @@ if HAS_TRITON: "grouped_topk", "cutlass_moe_fp8", "cutlass_moe_fp4", + "cutlass_moe_w4a8_fp8", "CutlassExpertsFp8", "CutlassBatchedExpertsFp8", + "CutlassExpertsW4A8Fp8", "TritonExperts", "BatchedTritonExperts", "DeepGemmExperts", diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index e52845dfa246d..5eb6bc4829adf 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -143,6 +143,7 @@ class FusedMoEQuantDesc: scale: Union[torch.Tensor, "PrecisionConfig", None] = None # Quantization alphas or gscales, used for nvfp4 types. + # W4A8 FP8: used for per-channel scales # TODO(bnell): put some of these in subclasses alpha_or_gscale: torch.Tensor | None = None @@ -442,7 +443,9 @@ class FusedMoEQuantConfig: - a1_scale: Optional scale to be used for a1. - a2_scale: Optional scale to be used for a2. - g1_alphas: Optional global quantization scales for w1 (for nvfp4). + per-channel scales for w1 (for W4A8 FP8). - g2_alphas: Optional global quantization scales for w2 (for nvfp4). + per-channel scales for w2 (for W4A8 FP8). - a1_gscale: Optional global quantization scales for a1 (for nvfp4). - a2_gscale: Optional global quantization scales for a2 (for nvfp4). - w1_bias: Optional biases for w1 (GPT OSS Triton). @@ -461,6 +464,7 @@ class FusedMoEQuantConfig: "mxfp4", "mxfp6_e3m2", "mxfp6_e2m3", + "int4", } if weight_dtype is None: @@ -671,6 +675,67 @@ def int8_w8a16_moe_quant_config( ) +def int4_w4afp8_moe_quant_config( + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + g1_alphas: torch.Tensor, + g2_alphas: torch.Tensor, + per_act_token_quant: bool = False, + per_out_ch_quant: bool = False, + block_shape: list[int] | None = None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for fp8 activations and int4 weights. + """ + return FusedMoEQuantConfig.make( + torch.float8_e4m3fn, # quant dtype for activations + w1_scale=w1_scale, + w2_scale=w2_scale, + g1_alphas=g1_alphas, + g2_alphas=g2_alphas, + per_act_token_quant=per_act_token_quant, + per_out_ch_quant=per_out_ch_quant, + block_shape=block_shape, + weight_dtype="int4", # weight dtype for weights + ) + + +def awq_marlin_moe_quant_config( + w1_scale: torch.Tensor, + w2_scale: torch.Tensor, + w1_zp: torch.Tensor | None, + w2_zp: torch.Tensor | None, + weight_bits: int, + group_size: int, + w1_bias: torch.Tensor | None = None, + w2_bias: torch.Tensor | None = None, +) -> FusedMoEQuantConfig: + """ + Construct a quant config for awq marlin quantization. 
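int4_w4afp8_moe_quant_config bundles the W4A8 pieces into one FusedMoEQuantConfig: fp8 activation quantization, "int4" weight dtype, group scales in w1_scale/w2_scale and per-channel scales in g1_alphas/g2_alphas. A hypothetical call; the tensor shapes below are illustrative placeholders (only the fp8 group-scale dtype is actually asserted by run_cutlass_moe_w4a8_fp8 later in this diff):

    import torch
    from vllm.model_executor.layers.fused_moe.config import int4_w4afp8_moe_quant_config

    E, N, K, group = 8, 1024, 2048, 128  # experts / intermediate / hidden / group size (made up)
    quant_config = int4_w4afp8_moe_quant_config(
        w1_scale=torch.empty(E, 2 * N, K // group, dtype=torch.float8_e4m3fn),  # group scales
        w2_scale=torch.empty(E, K, N // group, dtype=torch.float8_e4m3fn),
        g1_alphas=torch.empty(E, 2 * N, dtype=torch.float32),  # per-channel scales
        g2_alphas=torch.empty(E, K, dtype=torch.float32),
        per_act_token_quant=True,
        per_out_ch_quant=True,
    )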
+ """ + from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape + + w_shape = None if group_size == -1 else GroupShape(row=1, col=group_size) + + # Activations are NOT quantized for AWQ (fp16/bf16) + a_shape = w_shape # Same as weight shape for alignment + + # Determine weight dtype + if weight_bits == 4: + weight_dtype = "int4" + elif weight_bits == 8: + weight_dtype = torch.int8 + else: + raise ValueError(f"Unsupported weight_bits: {weight_bits}") + + return FusedMoEQuantConfig( + _a1=FusedMoEQuantDesc(dtype=None, shape=a_shape), + _a2=FusedMoEQuantDesc(dtype=None, shape=a_shape), + _w1=FusedMoEQuantDesc(weight_dtype, w_shape, w1_scale, None, w1_zp, w1_bias), + _w2=FusedMoEQuantDesc(weight_dtype, w_shape, w2_scale, None, w2_zp, w2_bias), + ) + + def biased_moe_quant_config( w1_bias: torch.Tensor | None, w2_bias: torch.Tensor | None, diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 30144ca5452eb..552e38a71bf98 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -1052,3 +1052,404 @@ def run_cutlass_block_scaled_fused_experts( return ( c2[c_map].view(m, topk, k) * topk_weights.view(m, topk, 1).to(out_dtype) ).sum(dim=1) + + +# W4A8 +def run_cutlass_moe_w4a8_fp8( + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_ids: torch.Tensor, + activation_callable: Callable, + global_num_experts: int, + expert_map: torch.Tensor | None, + w1_scale: torch.Tensor | None, + w2_scale: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, + w1_chan_scale: torch.Tensor, + w2_chan_scale: torch.Tensor, + a_strides1: torch.Tensor, + a_strides2: torch.Tensor, + b_strides1: torch.Tensor, + b_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, + s_strides1: torch.Tensor, + s_strides2: torch.Tensor, + workspace13: torch.Tensor, + workspace2: torch.Tensor, + expert_num_tokens: torch.Tensor | None, + out_dtype: torch.dtype, + per_act_token: bool, + per_out_ch: bool, + use_batched_format: bool, + topk_weights: torch.Tensor | None, + group_size: int, +): + a1q = hidden_states + M = a1q.size(0) + local_E = w1.size(0) + device = a1q.device + _, K, N_packed = w2.shape + N = N_packed * 8 # logical N, pack 8 int4 into 1 int32 + + assert per_act_token, "W4A8 must use per-token scales" + assert per_out_ch, "W4A8 must use per-channel scales" + assert w1_scale is not None + assert w2_scale is not None + assert w1_scale.dtype == torch.float8_e4m3fn + assert w2_scale.dtype == torch.float8_e4m3fn + assert w1.dtype == torch.int32 + assert w2.dtype == torch.int32 + assert w1_chan_scale.dtype == torch.float32 + assert w2_chan_scale.dtype == torch.float32 + assert w1.size(0) == w2.size(0), "Weights expert number mismatch" + assert a1q_scale is not None + assert a2_scale is None + assert out_dtype in [torch.bfloat16], f"Invalid output dtype: {out_dtype}" + if expert_map is not None: + assert expert_num_tokens is None + assert not use_batched_format, "batched format not supported yet" + assert group_size == 128, f"Only group size 128 supported but got {group_size=}" + + assert global_num_experts != -1 + assert w1.size(2) * 8 == K, ( + f"w1 hidden size mismatch: got {w1.size(2) * 8}, expected {K=}" + ) + + # Translate info from expert_map to topk_ids + if expert_map is not None: + local_topk_ids = torch.where( + expert_map[topk_ids] != -1, 
expert_map[topk_ids], -1 + ) + else: + local_topk_ids = topk_ids + + topk = local_topk_ids.size(1) + a1q_perm = _resize_cache(workspace2.view(dtype=torch.float8_e4m3fn), (M * topk, K)) + mm1_out = _resize_cache(workspace13, (M * topk, N * 2)) + act_out = _resize_cache(workspace2, (M * topk, N)) + # original workspace are based on input hidden_states dtype (bf16) + quant_out = _resize_cache( + workspace13.view(dtype=torch.float8_e4m3fn), (M * topk, N) + ) + mm2_out = _resize_cache(workspace2, (M * topk, K)) + + problem_sizes1 = torch.empty( + (global_num_experts, 3), dtype=torch.int32, device=device + ) + problem_sizes2 = torch.empty( + (global_num_experts, 3), dtype=torch.int32, device=device + ) + + num_expert = global_num_experts if expert_map is None else expert_map.size(0) + # permuted a1q reuses workspace2 + a1q, a1q_scale, expert_offsets, inv_perm, _ = moe_permute( + a1q, + a1q_scale, + topk_ids, + num_expert, + local_E, + expert_map, + permuted_hidden_states=a1q_perm, + ) + expert_offsets = expert_offsets[:-1] + + # For RS gemm SwapAB is always enabled (swap logical M, N in the problem shape) + ops.get_cutlass_moe_mm_problem_sizes( + local_topk_ids, + problem_sizes1, + problem_sizes2, + global_num_experts, + N, + K, + force_swap_ab=True, + ) + + ops.cutlass_w4a8_moe_mm( + mm1_out, + a1q, + w1, + a1q_scale, + w1_chan_scale, + w1_scale, + group_size, + expert_offsets, + problem_sizes1, + a_strides1, + b_strides1, + c_strides1, + s_strides1, + ) + + activation_callable(act_out, mm1_out) + + a2q, a2q_scale = ops.scaled_fp8_quant( + act_out, a2_scale, use_per_token_if_dynamic=per_act_token, output=quant_out + ) + + if expert_map is not None: + mm2_out.fill_(0) + + ops.cutlass_w4a8_moe_mm( + mm2_out, + a2q, + w2, + a2q_scale, + w2_chan_scale, + w2_scale, + group_size, + expert_offsets, + problem_sizes2, + a_strides2, + b_strides2, + c_strides2, + s_strides2, + ) + + # for non-chunking mode the output is resized from workspace13 + # so we need to make sure mm2_out uses workspace2. 
+ moe_unpermute( + out=output, + permuted_hidden_states=mm2_out, + topk_weights=topk_weights, + inv_permuted_idx=inv_perm, + ) + + +class CutlassExpertsW4A8Fp8(mk.FusedMoEPermuteExpertsUnpermute): + def __init__( + self, + out_dtype: torch.dtype | None, + a_strides1: torch.Tensor, + a_strides2: torch.Tensor, + b_strides1: torch.Tensor, + b_strides2: torch.Tensor, + c_strides1: torch.Tensor, + c_strides2: torch.Tensor, + s_strides1: torch.Tensor, + s_strides2: torch.Tensor, + quant_config: FusedMoEQuantConfig, + group_size: int, + ): + super().__init__(quant_config) + self.out_dtype = out_dtype + self.a_strides1 = a_strides1 + self.a_strides2 = a_strides2 + self.b_strides1 = b_strides1 + self.b_strides2 = b_strides2 + self.c_strides1 = c_strides1 + self.c_strides2 = c_strides2 + self.s_strides1 = s_strides1 + self.s_strides2 = s_strides2 + self.group_size = group_size + + @property + def activation_formats( + self, + ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]: + return ( + mk.FusedMoEActivationFormat.Standard, + mk.FusedMoEActivationFormat.Standard, + ) + + def supports_chunking(self) -> bool: + return True + + def supports_expert_map(self) -> bool: + return True + + def finalize_weight_and_reduce_impl(self) -> mk.TopKWeightAndReduce: + # topk weights and reduction are fused in moe_unpermute cuda kernel + return TopKWeightAndReduceNoOP() + + def workspace_dtype(self, act_dtype: torch.dtype) -> torch.dtype: + return self.out_dtype if self.out_dtype is not None else act_dtype + + def workspace_shapes( + self, + M: int, + N: int, + K: int, + topk: int, + global_num_experts: int, + local_num_experts: int, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]: + workspace1 = (M * topk, max(N, K)) + workspace2 = (M * topk, max(N // 2, K)) + output = (M, K) + return (workspace1, workspace2, output) + + def apply( + self, + output: torch.Tensor, + hidden_states: torch.Tensor, + w1: torch.Tensor, + w2: torch.Tensor, + topk_weights: torch.Tensor, + topk_ids: torch.Tensor, + activation: str, + global_num_experts: int, + expert_map: torch.Tensor | None, + a1q_scale: torch.Tensor | None, + a2_scale: torch.Tensor | None, + workspace13: torch.Tensor | None, + workspace2: torch.Tensor | None, + expert_tokens_meta: mk.ExpertTokensMetadata | None, + apply_router_weight_on_input: bool, + ): + assert self.w1_zp is None, "w1_zp is not supported in CUTLASS MoE" + assert self.w2_zp is None, "w2_zp is not supported in CUTLASS MoE" + + expert_num_tokens = None + activation_callable = lambda o, i: self.activation(activation, o, i) + + use_batched_format = ( + self.activation_formats[0] == mk.FusedMoEActivationFormat.BatchedExperts + ) + assert not use_batched_format, "batched format not supported" + + in_dtype = hidden_states.dtype + + run_cutlass_moe_w4a8_fp8( + output, + hidden_states, + w1, + w2, + topk_ids, + activation_callable, + global_num_experts, + expert_map, + self.w1_scale, + self.w2_scale, + a1q_scale, + a2_scale, + self.g1_alphas, # per-channel scales + self.g2_alphas, # per-channel scales + self.a_strides1, + self.a_strides2, + self.b_strides1, + self.b_strides2, + self.c_strides1, + self.c_strides2, + self.s_strides1, + self.s_strides2, + workspace13, + workspace2, + expert_num_tokens, + self.out_dtype if self.out_dtype is not None else in_dtype, + self.per_act_token_quant, + self.per_out_ch_quant, + use_batched_format, + topk_weights, + self.group_size, + ) + + +def cutlass_moe_w4a8_fp8( + a: torch.Tensor, + 
w1_q: torch.Tensor,
+    w2_q: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    a_strides1: torch.Tensor,
+    a_strides2: torch.Tensor,
+    b_strides1: torch.Tensor,
+    b_strides2: torch.Tensor,
+    c_strides1: torch.Tensor,
+    c_strides2: torch.Tensor,
+    s_strides1: torch.Tensor,
+    s_strides2: torch.Tensor,
+    quant_config: FusedMoEQuantConfig,
+    activation: str = "silu",
+    expert_map: torch.Tensor | None = None,
+    apply_router_weight_on_input: bool = False,
+    global_num_experts: int = -1,
+    group_size: int = 128,
+) -> torch.Tensor:
+    """
+    This function computes a w4a8-quantized Mixture of Experts (MoE) layer
+    using two sets of quantized weights, w1_q and w2_q, and a top-k gating
+    mechanism. The matrix multiplications are implemented with CUTLASS
+    mixed-dtype grouped gemm.
+
+    Parameters:
+    - a (torch.Tensor): The input tensor to the MoE layer.
+        Shape: [M, K]
+    - w1_q (torch.Tensor): The first set of int4-quantized expert weights,
+        packed into int32.
+        Shape: [num_experts, 2*N, K // packed_factor]
+    - w2_q (torch.Tensor): The second set of int4-quantized expert weights,
+        packed into int32.
+        Shape: [num_experts, K, N // packed_factor]
+    - topk_weights (torch.Tensor): The weights of each token->expert mapping.
+    - topk_ids (torch.Tensor): The token->expert mappings.
+    - a_strides1 (torch.Tensor): The input strides for the first gemm.
+        Shape: [num_experts]
+    - a_strides2 (torch.Tensor): The input strides for the second gemm.
+        Shape: [num_experts]
+    - b_strides1 (torch.Tensor): The packed layout for the first gemm weights.
+        Shape: [num_experts, 3]
+        dtype: torch.int32
+    - b_strides2 (torch.Tensor): The packed layout for the second gemm weights.
+        Shape: [num_experts, 3]
+        dtype: torch.int32
+    - c_strides1 (torch.Tensor): The output strides for the first gemm.
+        Shape: [num_experts]
+    - c_strides2 (torch.Tensor): The output strides for the second gemm.
+        Shape: [num_experts]
+    - s_strides1 (torch.Tensor): strides for the group-wise scales for the first gemm.
+        Shape: [num_experts, 2]
+        dtype: torch.int64
+    - s_strides2 (torch.Tensor): strides for the group-wise scales for the second gemm.
+        Shape: [num_experts, 2]
+        dtype: torch.int64
+    - quant_config (FusedMoEQuantConfig): The quantization configuration,
+        carrying the weight scales and the activation quantization settings
+        (e.g. per-token vs. per-tensor activation scales).
+    - activation (str): The activation function to use.
+    - expert_map (Optional[torch.Tensor]): In the case of expert parallelism,
+        every rank is responsible for a subset of experts. expert_map is a
+        mapping from global expert-id to local expert-id. When expert_map[i]
+        is -1, it means that this rank is not responsible for global
+        expert-id i.
+    - apply_router_weight_on_input (bool): When true, the topk weights are
+        applied directly on the inputs. This is only applicable when topk is 1.
+    - global_num_experts (int): The total number of experts.
+    - group_size (int): The number of weights per group-wise scale factor.
+
+    Returns:
+    - torch.Tensor: The output tensor after applying the MoE layer, in the
+        input dtype.
+ """ + assert quant_config is not None + + num_experts = global_num_experts if global_num_experts != -1 else w1_q.size(0) + + fn = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + CutlassExpertsW4A8Fp8( + out_dtype=a.dtype, + a_strides1=a_strides1, + a_strides2=a_strides2, + b_strides1=b_strides1, + b_strides2=b_strides2, + c_strides1=c_strides1, + c_strides2=c_strides2, + s_strides1=s_strides1, + s_strides2=s_strides2, + quant_config=quant_config, + group_size=group_size, + ), + ) + + return fn( + a, + w1_q, + w2_q, + topk_weights, + topk_ids, + activation=activation, + global_num_experts=num_experts, + expert_map=expert_map, + apply_router_weight_on_input=apply_router_weight_on_input, + ) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index f3c158ee2f9dc..0b83a3f5c4803 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -895,6 +895,48 @@ def get_moe_configs( return None +def _ensure_block_size_k_divisible( + size_k: int, block_size_k: int, group_size: int +) -> int: + """Ensure block_size_k is a divisor of size_k and divisible by group_size. + + This ensures BLOCK_SIZE_K compatibility with MoeWNA16 CUDA kernel which + requires size_k % BLOCK_SIZE_K == 0 and BLOCK_SIZE_K % group_size == 0. + + Args: + size_k: The size_k dimension that must be divisible by result. + block_size_k: Preferred block size (will be adjusted if needed). + group_size: The result must be divisible by this. + + Returns: + A valid BLOCK_SIZE_K that divides size_k and is divisible by group_size. + """ + # Fast path: already valid + if size_k % block_size_k == 0 and block_size_k % group_size == 0: + return block_size_k + + # Find the largest value that: + # 1. Divides size_k (size_k % candidate == 0) + # 2. Is divisible by group_size (candidate % group_size == 0) + # 3. Is <= block_size_k (prefer smaller values close to block_size_k) + # + # Strategy: Search from min(block_size_k, size_k) down to group_size, + # stepping by group_size to ensure divisibility by group_size + max_search = min(block_size_k, size_k) + start = (max_search // group_size) * group_size + for candidate in range(start, group_size - 1, -group_size): + if size_k % candidate == 0: + return candidate + + # Fallback: if group_size divides size_k, use it + # This should always be true with correct group_size configuration + if size_k % group_size == 0: + return group_size + + # This should not happen with correct group_size, but ensure divisibility + return size_k + + def get_moe_wna16_block_config( config: dict[str, int], use_moe_wna16_cuda: bool, @@ -960,6 +1002,9 @@ def get_moe_wna16_block_config( # at the same time. 
block_size_n = 1024 + # Ensure BLOCK_SIZE_K is a divisor of size_k for CUDA kernel compatibility + block_size_k = _ensure_block_size_k_divisible(size_k, block_size_k, group_size) + return {"BLOCK_SIZE_N": block_size_n, "BLOCK_SIZE_K": block_size_k} diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py index ef7090c349fc6..8c9d8a2777d58 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from abc import abstractmethod -from collections.abc import Callable import torch @@ -100,22 +99,5 @@ class FusedMoEMethodBase(QuantizeMethodBase): layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: raise NotImplementedError diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index b33e7fd8a0215..1947423bf4777 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable import torch @@ -97,23 +96,6 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: topk_weights, topk_ids, zero_expert_result = layer.select_experts( hidden_states=x, @@ -127,10 +109,10 @@ class FusedMoEModularMethod(FusedMoEMethodBase, CustomOp): topk_weights=topk_weights, topk_ids=topk_ids, inplace=self.allow_inplace, - activation=activation, - global_num_experts=global_num_experts, - apply_router_weight_on_input=apply_router_weight_on_input, - expert_map=None if self.disable_expert_map else expert_map, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + 
apply_router_weight_on_input=layer.apply_router_weight_on_input, + expert_map=None if self.disable_expert_map else layer.expert_map, ) if layer.zero_expert_num != 0 and layer.zero_expert_type is not None: diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 9b4d77a060c29..7f803720d4770 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -33,10 +33,6 @@ from vllm.model_executor.layers.fused_moe.config import ( RoutingMethodType, ) from vllm.model_executor.layers.fused_moe.fused_moe import zero_experts_compute_triton -from vllm.model_executor.layers.fused_moe.modular_kernel import ( - FusedMoEPermuteExpertsUnpermute, - FusedMoEPrepareAndFinalize, -) from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( init_aiter_topK_meta_data, ) @@ -57,11 +53,8 @@ from vllm.utils.torch_utils import ( from vllm.v1.worker.ubatching import dbo_current_ubatch_id if current_platform.is_cuda_alike(): - from .fused_moe import eplb_map_to_physical_and_record, fused_experts + from .fused_moe import eplb_map_to_physical_and_record else: - fused_experts = None # type: ignore - FusedMoEPermuteExpertsUnpermute = object # type: ignore - FusedMoEPrepareAndFinalize = object # type: ignore def _eplb_map_to_physical_and_record( topk_ids: torch.Tensor, @@ -483,7 +476,7 @@ class FusedMoE(CustomOp): enable_eplb=self.enable_eplb, ) - self.expert_map: torch.Tensor | None + self._expert_map: torch.Tensor | None local_num_experts, expert_map, expert_mask = determine_expert_map( ep_size=self.ep_size, ep_rank=self.ep_rank, @@ -493,7 +486,7 @@ class FusedMoE(CustomOp): return_expert_mask=self.rocm_aiter_fmoe_enabled, ) self.local_num_experts = local_num_experts - self.register_buffer("expert_map", expert_map) + self.register_buffer("_expert_map", expert_map) self.register_buffer("expert_mask", expert_mask) self._maybe_init_expert_routing_tables() logger.info_once( @@ -506,10 +499,10 @@ class FusedMoE(CustomOp): self.expert_placement_strategy, self.local_num_experts, self.global_num_experts, - get_compressed_expert_map(self.expert_map), + get_compressed_expert_map(self._expert_map), ) else: - self.local_num_experts, self.expert_map, self.expert_mask = ( + self.local_num_experts, self._expert_map, self.expert_mask = ( self.global_num_experts, None, None, @@ -753,7 +746,7 @@ class FusedMoE(CustomOp): self.moe_parallel_config.use_pplx_kernels or self.moe_parallel_config.use_deepep_ll_kernels or (self.dp_size > 1 and self.use_flashinfer_cutlass_kernels) - ) + ) and envs.VLLM_ENABLE_MOE_DP_CHUNK @property def is_internal_router(self) -> bool: @@ -781,7 +774,7 @@ class FusedMoE(CustomOp): ), ) - if self.expert_map is None: + if self._expert_map is None: return None routing_tables = self.ensure_round_robin_expert_routing_tables( @@ -789,7 +782,7 @@ class FusedMoE(CustomOp): ep_size=self.ep_size, ep_rank=self.ep_rank, local_num_experts=self.local_num_experts, - device=self.expert_map.device, + device=self._expert_map.device, ) global_to_physical, physical_to_global, local_global = routing_tables @@ -840,8 +833,8 @@ class FusedMoE(CustomOp): def update_expert_map(self): # ep_size and ep_rank should already be updated - assert self.expert_map is not None - with self.expert_map.device: + assert self._expert_map is not None + with self._expert_map.device: local_num_experts, expert_map, expert_mask = determine_expert_map( ep_size=self.ep_size, ep_rank=self.ep_rank, @@ -851,7 +844,7 @@ class FusedMoE(CustomOp): 
return_expert_mask=self.rocm_aiter_fmoe_enabled, ) self.local_num_experts = local_num_experts - self.register_buffer("expert_map", expert_map) + self.register_buffer("_expert_map", expert_map) self.register_buffer("expert_mask", expert_mask) self._maybe_init_expert_routing_tables() if self.aiter_fmoe_shared_expert_enabled: @@ -888,7 +881,7 @@ class FusedMoE(CustomOp): # Record that the clone will be used by shared_experts_stream # to avoid gc issue from deallocation of hidden_states_clone # For more details: https://docs.pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html # noqa: E501 - # NOTE: We dont need shared_output.record_stream(current_stream()) + # NOTE: We don't need shared_output.record_stream(current_stream()) # because we synch the streams before using shared_output. hidden_states_clone.record_stream(self.shared_experts_stream) @@ -1068,9 +1061,9 @@ class FusedMoE(CustomOp): expert_data.copy_(loaded_weight) def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int: - if self.expert_map is None: + if self._expert_map is None: return expert_id - return self.expert_map[expert_id].item() + return self._expert_map[expert_id].item() def _init_aiter_shared_experts_topK_buffer( self, vllm_config: VllmConfig, dp_size: int @@ -1563,6 +1556,14 @@ class FusedMoE(CustomOp): f"EPLB is not supported for {self.quant_method.method_name}." ) + def valid_grouping() -> bool: + # Check if num_experts is greater than num_expert_group + # and is divisible by num_expert_group + num_experts = router_logits.shape[-1] + if num_experts <= self.num_expert_group: + return False + return num_experts % self.num_expert_group == 0 + indices_type = self.quant_method.topk_indices_dtype # Check if we should use a routing simulation strategy @@ -1577,7 +1578,7 @@ class FusedMoE(CustomOp): ) # DeepSeekv2 uses grouped_top_k - elif self.use_grouped_topk: + elif self.use_grouped_topk and valid_grouping(): assert self.topk_group is not None assert self.num_expert_group is not None if rocm_aiter_ops.is_fused_moe_enabled(): @@ -1744,6 +1745,12 @@ class FusedMoE(CustomOp): reduce_output(fused_output)[..., :og_hidden_states], ) + @property + def expert_map(self) -> torch.Tensor | None: + return ( + self._expert_map if not self.rocm_aiter_fmoe_enabled else self.expert_mask + ) + def forward_cuda( self, hidden_states: torch.Tensor, @@ -1805,24 +1812,6 @@ class FusedMoE(CustomOp): layer=self, x=staged_hidden_states, router_logits=staged_router_logits, - top_k=self.top_k, - renormalize=self.renormalize, - use_grouped_topk=self.use_grouped_topk, - global_num_experts=self.global_num_experts, - expert_map=self.expert_map - if not self.rocm_aiter_fmoe_enabled - else self.expert_mask, - topk_group=self.topk_group, - num_expert_group=self.num_expert_group, - custom_routing_function=self.custom_routing_function, - scoring_func=self.scoring_func, - routed_scaling_factor=self.routed_scaling_factor, - e_score_correction_bias=self.e_score_correction_bias, - activation=self.activation, - enable_eplb=self.enable_eplb, - expert_load_view=self.expert_load_view, - logical_to_physical_map=self.logical_to_physical_map, - logical_replica_count=self.logical_replica_count, ) if has_separate_shared_experts: @@ -1968,25 +1957,6 @@ class FusedMoE(CustomOp): if do_naive_dispatch_combine else hidden_states, router_logits=router_logits, - top_k=self.top_k, - renormalize=self.renormalize, - use_grouped_topk=self.use_grouped_topk, - global_num_experts=self.global_num_experts, - expert_map=self.expert_map - if not 
self.rocm_aiter_fmoe_enabled - else self.expert_mask, - topk_group=self.topk_group, - num_expert_group=self.num_expert_group, - custom_routing_function=self.custom_routing_function, - scoring_func=self.scoring_func, - routed_scaling_factor=self.routed_scaling_factor, - e_score_correction_bias=self.e_score_correction_bias, - activation=self.activation, - apply_router_weight_on_input=self.apply_router_weight_on_input, - enable_eplb=self.enable_eplb, - expert_load_view=self.expert_load_view, - logical_to_physical_map=self.logical_to_physical_map, - logical_replica_count=self.logical_replica_count, ) if has_separate_shared_experts: diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 51d3299e7ddf1..075610ec588ae 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -367,7 +367,7 @@ class FusedMoEPrepareAndFinalize(ABC): class FusedMoEPermuteExpertsUnpermute(ABC): """ An abstract base class for the [Permute-Experts-Unpermute] step described - above. + above. """ def __init__( diff --git a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 48e5a8907f926..6182f10aa70f0 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable import torch import torch.nn.functional as F @@ -269,53 +268,14 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): def apply( self, - layer: torch.nn.Module, + layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - assert expert_load_view is not None - assert logical_to_physical_map is not None - assert logical_replica_count is not None - return self.forward( - x=x, layer=layer, + x=x, router_logits=router_logits, - top_k=top_k, - renormalize=renormalize, - use_grouped_topk=use_grouped_topk, - topk_group=topk_group, - num_expert_group=num_expert_group, - global_num_experts=global_num_experts, - expert_map=expert_map, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - enable_eplb=enable_eplb, - expert_load_view=expert_load_view, - logical_to_physical_map=logical_to_physical_map, - logical_replica_count=logical_replica_count, ) def get_fused_moe_quant_config( @@ -333,24 +293,7 @@ class 
UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): self, layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, router_logits: torch.Tensor, - renormalize: bool, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: topk_weights, topk_ids, zero_expert_result = layer.select_experts( hidden_states=x, @@ -364,9 +307,9 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): w2=layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - expert_map=expert_map, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=layer.expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) elif self.flashinfer_cutlass_moe_enabled: return self.flashinfer_cutlass_moe( @@ -375,8 +318,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): w2=layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) else: result = fused_experts( @@ -386,11 +329,11 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, + activation=layer.activation, quant_config=self.moe_quant_config, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, ) if layer.zero_expert_num != 0 and layer.zero_expert_type is not None: @@ -405,148 +348,101 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): self, layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, router_logits: torch.Tensor, - renormalize: bool, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: if ( - enable_eplb is not False - or expert_load_view is not None - or logical_to_physical_map is not None - or logical_replica_count is not None + layer.enable_eplb is not False + or layer.expert_load_view is not None + or layer.logical_to_physical_map is not None + or 
layer.logical_replica_count is not None ): raise NotImplementedError("Expert load balancing is not supported for CPU.") + return layer.cpu_fused_moe( layer, x, - use_grouped_topk, - top_k, + layer.use_grouped_topk, + layer.top_k, router_logits, - renormalize, - topk_group, - num_expert_group, - global_num_experts, - expert_map, - custom_routing_function, - scoring_func, - routed_scaling_factor, - e_score_correction_bias, - apply_router_weight_on_input, - activation, + layer.renormalize, + layer.topk_group, + layer.num_expert_group, + layer.global_num_experts, + layer.expert_map, + layer.custom_routing_function, + layer.scoring_func, + layer.routed_scaling_factor, + layer.e_score_correction_bias, + layer.apply_router_weight_on_input, + layer.activation, ) def forward_xpu( self, layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, router_logits: torch.Tensor, - renormalize: bool, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: if ( - enable_eplb is not False - or expert_load_view is not None - or logical_to_physical_map is not None - or logical_replica_count is not None + layer.enable_eplb is not False + or layer.expert_load_view is not None + or layer.logical_to_physical_map is not None + or layer.logical_replica_count is not None ): raise NotImplementedError("Expert load balancing is not supported for XPU.") return layer.ipex_fusion( x, - use_grouped_topk, - top_k, + layer.use_grouped_topk, + layer.top_k, router_logits, - renormalize, - topk_group, - num_expert_group, - custom_routing_function=custom_routing_function, + layer.renormalize, + layer.topk_group, + layer.num_expert_group, + custom_routing_function=layer.custom_routing_function, ) def forward_tpu( self, layer: "FusedMoE", # type: ignore[name-defined] # noqa: F821 x: torch.Tensor, - use_grouped_topk: bool, - top_k: int, router_logits: torch.Tensor, - renormalize: bool, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert not use_grouped_topk - assert num_expert_group is None - assert topk_group is None - assert custom_routing_function is None - assert apply_router_weight_on_input is False - if scoring_func != "softmax": + assert not layer.use_grouped_topk + assert layer.num_expert_group is None + assert layer.topk_group is None + assert layer.custom_routing_function is None + assert 
layer.apply_router_weight_on_input is False + if layer.scoring_func != "softmax": raise NotImplementedError( "Only softmax scoring function is supported for TPU." ) - if e_score_correction_bias is not None: + if layer.e_score_correction_bias is not None: raise NotImplementedError( "Expert score correction bias is not supported for TPU." ) - assert activation == "silu", f"{activation} is not supported for TPU." - assert routed_scaling_factor == 1.0, ( - f"routed_scaling_factor {routed_scaling_factor} is not supported for TPU." + assert layer.activation == "silu", ( + f"{layer.activation} is not supported for TPU." + ) + assert layer.routed_scaling_factor == 1.0, ( + f"routed_scaling_factor {layer.routed_scaling_factor} is " + "not supported for TPU." ) if ( - enable_eplb is not False - or expert_load_view is not None - or logical_to_physical_map is not None - or logical_replica_count is not None + layer.enable_eplb is not False + or layer.expert_load_view is not None + or layer.logical_to_physical_map is not None + or layer.logical_replica_count is not None ): raise NotImplementedError("Expert load balancing is not supported for TPU.") return fused_moe_pallas( hidden_states=x, w1=layer.w13_weight, w2=layer.w2_weight, - topk=top_k, + topk=layer.top_k, gating_output=router_logits, - global_num_experts=global_num_experts, - expert_map=expert_map, - renormalize=renormalize, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, + renormalize=layer.renormalize, ) if current_platform.is_tpu(): diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index d463e181fd2db..3ed15ed7dd422 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from typing import TYPE_CHECKING, Any, Optional import torch @@ -471,6 +470,11 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase): } ) + intermediate_size_full = extra_weight_attrs.pop( + "intermediate_size_full", intermediate_size_per_partition + ) + self.is_k_full = intermediate_size_per_partition == intermediate_size_full + w13_qweight = Parameter( torch.empty( num_experts, @@ -598,6 +602,13 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase): ) replace_parameter(layer, "w2_qweight", marlin_w2_qweight) + # The modular kernel expects w13_weight and w2_weight, + # but AWQ uses w13_qweight and w2_qweight + # Alias for modular kernel + layer.w13_weight = layer.w13_qweight + # Alias for modular kernel + layer.w2_weight = layer.w2_qweight + # Why does this take the intermediate size for size_k? 
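The `is_k_full` flag added above compares the rank-local intermediate size with the full intermediate size popped from `extra_weight_attrs`. A small numeric sketch, assuming hypothetical sizes and 2-way tensor parallelism:

# Hypothetical numbers for illustration; real values come from the model config
# and from how the MoE intermediate dimension is sharded across TP ranks.
intermediate_size_full = 2816
tp_size = 2
intermediate_size_per_partition = intermediate_size_full // tp_size  # 1408

is_k_full = intermediate_size_per_partition == intermediate_size_full
print(is_k_full)  # False: the intermediate (reduction) dimension is sharded across ranks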
marlin_w13_scales = marlin_moe_permute_scales( s=layer.w13_scales, @@ -662,32 +673,96 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase): def get_fused_moe_quant_config( self, layer: torch.nn.Module ) -> FusedMoEQuantConfig | None: - return None + from vllm.model_executor.layers.fused_moe.config import ( + awq_marlin_moe_quant_config, + ) + + return awq_marlin_moe_quant_config( + w1_scale=layer.w13_scales, + w2_scale=layer.w2_scales, + weight_bits=self.quant_config.weight_bits, + group_size=self.quant_config.group_size, + w1_zp=getattr(layer, "w13_qzeros", None) + if self.quant_config.zero_point + else None, + w2_zp=getattr(layer, "w2_qzeros", None) + if self.quant_config.zero_point + else None, + w1_bias=getattr(layer, "w13_bias", None), + w2_bias=getattr(layer, "w2_bias", None), + ) + + def select_gemm_impl( + self, + prepare_finalize, + layer: torch.nn.Module, + ): + """ + Select the GEMM implementation for AWQ-Marlin MoE. + Returns MarlinExperts configured for AWQ quantization. + This is ONLY used when LoRA is enabled. + Without LoRA, AWQ uses its own apply() method. + """ + # Only use modular kernels when LoRA is enabled + # Without LoRA, AWQ's own apply() method works fine and is more efficient + if not self.moe.is_lora_enabled: + raise NotImplementedError( + "AWQ-Marlin uses its own apply() method when LoRA is not enabled. " + "Modular kernels are only used for LoRA support." + ) + + from vllm.model_executor.layers.fused_moe import modular_kernel as mk + from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( + BatchedMarlinExperts, + MarlinExperts, + ) + + # Ensure quant config is initialized + assert self.moe_quant_config is not None, ( + "moe_quant_config must be initialized before select_gemm_impl" + ) + + w13_g_idx = getattr(layer, "w13_g_idx", None) + w2_g_idx = getattr(layer, "w2_g_idx", None) + w13_g_idx_sort_indices = getattr(layer, "w13_g_idx_sort_indices", None) + w2_g_idx_sort_indices = getattr(layer, "w2_g_idx_sort_indices", None) + + # Check if using batched expert format (for Expert Parallelism) + if ( + prepare_finalize.activation_format + == mk.FusedMoEActivationFormat.BatchedExperts + ): + # For batched format, use BatchedMarlinExperts + max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank() + assert max_num_tokens_per_rank is not None + return BatchedMarlinExperts( + max_num_tokens=max_num_tokens_per_rank, + num_dispatchers=prepare_finalize.num_dispatchers(), + quant_config=self.moe_quant_config, + w13_g_idx=w13_g_idx, + w2_g_idx=w2_g_idx, + w13_g_idx_sort_indices=w13_g_idx_sort_indices, + w2_g_idx_sort_indices=w2_g_idx_sort_indices, + is_k_full=self.is_k_full, + ) + else: + # Standard Marlin experts for AWQ + return MarlinExperts( + quant_config=self.moe_quant_config, + w13_g_idx=w13_g_idx, + w2_g_idx=w2_g_idx, + w13_g_idx_sort_indices=w13_g_idx_sort_indices, + w2_g_idx_sort_indices=w2_g_idx_sort_indices, + is_k_full=self.is_k_full, + ) def apply( self, layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None 
= None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert activation == "silu", "Only SiLU activation is supported." + assert layer.activation == "silu", "Only SiLU activation is supported." topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, @@ -708,9 +783,9 @@ class AWQMarlinMoEMethod(FusedMoEMethodBase): input_global_scale1=getattr(layer, "w13_input_global_scale", None), input_global_scale2=getattr(layer, "w2_input_global_scale", None), quant_type_id=self.quant_type.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, w1_zeros=layer.w13_qzeros, w2_zeros=layer.w2_qzeros, workspace=layer.workspace, diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 1e57fa218b797..1fd959cb3423d 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from typing import Any, Union import torch @@ -498,23 +497,6 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts @@ -534,10 +516,10 @@ class BitsAndBytesMoEMethod(FusedMoEMethodBase): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index b91ecb59fee18..f835584219cca 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -116,16 +116,37 @@ class CompressedTensorsConfig(QuantizationConfig): return "compressed-tensors" def apply_vllm_mapper(self, hf_to_vllm_mapper: "WeightsMapper"): - self.target_scheme_map = hf_to_vllm_mapper.apply_dict(self.target_scheme_map) - self.ignore = 
hf_to_vllm_mapper.apply_list(self.ignore) - self.sparsity_scheme_map = hf_to_vllm_mapper.apply_dict( - self.sparsity_scheme_map - ) - self.sparsity_ignore_list = hf_to_vllm_mapper.apply_list( - self.sparsity_ignore_list - ) + """ + Transform layer paths in config targets to match vLLM's naming. + + The WeightsMapper is designed for weight paths, but some backends + (e.g. transformers) use broad prefix mappings like "" -> "model." + which would incorrectly transform non-path targets. + + compressed-tensors targets can be: + - Layer paths: "layers.0.self_attn.q_proj" -> transformed + - Module class names: "Linear" -> preserved (no ".") + - Regex patterns: "re:.*proj" -> preserved (starts with "re:") + """ + + def _map_target(target: str) -> str | None: + is_layer_path = "." in target and not target.startswith("re:") + if is_layer_path: + return hf_to_vllm_mapper._map_name(target) + return target + + def _apply_dict(d: dict) -> dict: + return {k: v for t, v in d.items() if (k := _map_target(t)) is not None} + + def _apply_list(lst: list) -> list: + return [t for x in lst if (t := _map_target(x)) is not None] + + self.target_scheme_map = _apply_dict(self.target_scheme_map) + self.ignore = _apply_list(self.ignore) + self.sparsity_scheme_map = _apply_dict(self.sparsity_scheme_map) + self.sparsity_ignore_list = _apply_list(self.sparsity_ignore_list) if self.kv_cache_scheme is not None: - self.kv_cache_scheme = hf_to_vllm_mapper.apply_dict(self.kv_cache_scheme) + self.kv_cache_scheme = _apply_dict(self.kv_cache_scheme) def get_quant_method( self, @@ -256,7 +277,7 @@ class CompressedTensorsConfig(QuantizationConfig): if format is not None else is_activation_quantization_format(quant_format) ) - # TODO(czhu): w4a8fp8 is in packed-quantized format + # w4a8fp8 is in packed-quantized format # but needs input activation quantization input_activations = quant_config.get("input_activations") if act_quant_format or input_activations: diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py index 8013b29f733bb..5ad26f9318df3 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py @@ -2,7 +2,6 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import enum -from collections.abc import Callable from enum import Enum import torch @@ -33,6 +32,7 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, fp8_w8a8_moe_quant_config, int4_w4a16_moe_quant_config, + int4_w4afp8_moe_quant_config, int8_w8a8_moe_quant_config, int8_w8a16_moe_quant_config, nvfp4_moe_quant_config, @@ -79,7 +79,11 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import ( from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( prepare_moe_fp8_layer_for_marlin, ) -from vllm.model_executor.layers.quantization.utils.quant_utils import swizzle_blockscale +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + convert_bf16_scales_to_fp8, + convert_packed_uint4b8_to_signed_int4_inplace, + swizzle_blockscale, +) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( all_close_1d, normalize_e4m3fn_to_e4m3fnuz, @@ -204,6 +208,11 @@ class CompressedTensorsMoEMethod(FusedMoEMethodBase): return CompressedTensorsW8A8Int8MoEMethod( weight_quant, input_quant, layer.moe_config ) + 
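To make the `_map_target` behaviour above concrete, here is a small sketch using a hypothetical prefix-style mapper that prepends `model.` (standing in for the broad "" -> "model." mappings mentioned in the docstring):

def is_layer_path(target: str) -> bool:
    # Same classification as _map_target: dotted layer paths are rewritten,
    # module class names and "re:" regex patterns are preserved.
    return "." in target and not target.startswith("re:")

def map_target(target: str) -> str:
    return "model." + target if is_layer_path(target) else target

assert map_target("layers.0.self_attn.q_proj") == "model.layers.0.self_attn.q_proj"
assert map_target("Linear") == "Linear"          # class name: no "."
assert map_target("re:.*proj") == "re:.*proj"    # regex pattern: preserved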
elif quant_config._is_fp8_w4a8_sm90(weight_quant, input_quant): + logger.info_once("Using CompressedTensorsW4A8Fp8MoEMethod") + return CompressedTensorsW4A8Fp8MoEMethod( + weight_quant, input_quant, layer.moe_config + ) elif quant_config._is_dynamic_token_w4a8_int(weight_quant, input_quant): return CompressedTensorsW4A8Int8MoEMethod( weight_quant, input_quant, layer.moe_config @@ -548,31 +557,14 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert activation == "silu", "Only SiLU activation is supported." + assert layer.activation == "silu", "Only SiLU activation is supported." if ( self.allow_flashinfer and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM ): - if enable_eplb: + if layer.enable_eplb: raise NotImplementedError( "EPLB not supported for `CompressedTensorsW4A4MoEMethod` yet." ) @@ -581,12 +573,12 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): layer=layer, x=x, router_logits=router_logits, - top_k=top_k, - global_num_experts=global_num_experts, - num_expert_group=num_expert_group, - topk_group=topk_group, - custom_routing_function=custom_routing_function, - e_score_correction_bias=e_score_correction_bias, + top_k=layer.top_k, + global_num_experts=layer.global_num_experts, + num_expert_group=layer.num_expert_group, + topk_group=layer.topk_group, + custom_routing_function=layer.custom_routing_function, + e_score_correction_bias=layer.e_score_correction_bias, ) topk_weights, topk_ids, _ = layer.select_experts( @@ -609,9 +601,9 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): global_scale1=layer.w13_weight_scale_2, global_scale2=layer.w2_weight_scale_2, quant_type_id=scalar_types.float4_e2m1f.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, input_dtype=self.marlin_input_dtype, workspace=layer.workspace, ) @@ -636,15 +628,15 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): topk_ids=topk_ids, quant_config=self.moe_quant_config, inplace=False, # TODO(shuw): fix later, now output is high prec - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) else: from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 - assert expert_map is None, ( + assert layer.expert_map is None, ( "Expert Parallelism / 
expert_map " "is currently not supported for " "CompressedTensorsW4A4Nvfp4MoEMethod." @@ -660,7 +652,7 @@ class CompressedTensorsW4A4Nvfp4MoEMethod(CompressedTensorsMoEMethod): topk_weights=topk_weights, topk_ids=topk_ids, quant_config=self.moe_quant_config, - apply_router_weight_on_input=apply_router_weight_on_input, + apply_router_weight_on_input=layer.apply_router_weight_on_input, # TODO(bnell): derive these from arguments m=x.shape[0], n=layer.w2_weight.shape[2] * 2, @@ -1178,23 +1170,6 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, @@ -1205,7 +1180,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): per_channel_quant = self.weight_quant.strategy == QuantizationStrategy.CHANNEL if self.use_marlin: - assert activation == "silu", f"{activation} not supported for Marlin MoE." + assert layer.activation == "silu", ( + f"{layer.activation} not supported for Marlin MoE." + ) return fused_marlin_moe( x, layer.w13_weight, @@ -1218,9 +1195,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): topk_weights, topk_ids, quant_type_id=scalar_types.float8_e4m3fn.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, input_dtype=self.marlin_input_dtype, workspace=layer.workspace, ) @@ -1238,9 +1215,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): w2=layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - expert_map=expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) @@ -1260,10 +1237,12 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=None if self.disable_expert_map else expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=None + if self.disable_expert_map + else layer.expert_map, # ??? 
quant_config=self.moe_quant_config, ) else: @@ -1280,9 +1259,9 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): topk_weights, topk_ids, quant_config=self.moe_quant_config, - activation=activation, - global_num_experts=global_num_experts, - expert_map=None if self.disable_expert_map else expert_map, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + expert_map=None if self.disable_expert_map else layer.expert_map, ab_strides1=self.ab_strides1_c_strides2, ab_strides2=self.ab_strides2, c_strides1=self.c_strides1, @@ -1304,10 +1283,10 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) @@ -1427,23 +1406,6 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts @@ -1459,10 +1421,10 @@ class CompressedTensorsW8A8Int8MoEMethod(CompressedTensorsMoEMethod): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) @@ -1804,25 +1766,10 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert activation == "silu", f"{activation} not supported for Marlin MoE." 
+ assert layer.activation == "silu", ( + f"{layer.activation} not supported for Marlin MoE." + ) topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, @@ -1843,9 +1790,9 @@ class CompressedTensorsWNA16MarlinMoEMethod(CompressedTensorsMoEMethod): input_global_scale1=getattr(layer, "w13_input_global_scale", None), input_global_scale2=getattr(layer, "w2_input_global_scale", None), quant_type_id=self.quant_type.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, g_idx1=layer.w13_weight_g_idx, g_idx2=layer.w2_weight_g_idx, sort_indices1=layer.w13_g_idx_sort_indices, @@ -2047,23 +1994,6 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts @@ -2079,10 +2009,10 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) @@ -2362,32 +2292,15 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod): def apply( self, - layer: torch.nn.Module, + layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor: - assert not enable_eplb, "EPLB not supported for W4A8-int MoE yet." - assert activation in ("silu", "swigluoai", "swiglu"), ( + assert not layer.enable_eplb, "EPLB not supported for W4A8-int MoE yet." + assert layer.activation in ("silu", "swigluoai", "swiglu"), ( "Only SiLU/SwiGLUGU/SwiGLUUG are supported." 
) - assert expert_map is None, """expert_map/EP not implemented + assert layer.expert_map is None, """expert_map/EP not implemented for CPU dyn-4bit MoE.""" def _act_kind(s: str) -> int: @@ -2404,15 +2317,9 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod): topk_weights, topk_ids = select_experts( hidden_states=x, router_logits=router_logits, - use_grouped_topk=use_grouped_topk, - top_k=top_k, - renormalize=renormalize, - topk_group=topk_group, - num_expert_group=num_expert_group, - custom_routing_function=custom_routing_function, - scoring_func=scoring_func, - routed_scaling_factor=routed_scaling_factor, - e_score_correction_bias=e_score_correction_bias, + top_k=layer.top_k, + use_grouped_topk=layer.use_grouped_topk, + renormalize=layer.renormalize, ) return torch.ops._C.dynamic_4bit_int_moe( @@ -2425,6 +2332,317 @@ class CompressedTensorsW4A8Int8MoEMethod(CompressedTensorsMoEMethod): layer.w2_in_features, layer.w13_out_features, layer.group_size, - apply_router_weight_on_input, - int(_act_kind(activation)), + layer.apply_router_weight_on_input, + int(_act_kind(layer.activation)), ) + + +class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod): + def __init__( + self, + weight_quant: QuantizationArgs, + input_quant: QuantizationArgs, + moe: FusedMoEConfig, + layer_name: str | None = None, + ): + super().__init__(moe) + self.weight_quant = weight_quant + self.input_quant = input_quant + + self.group_size = self.weight_quant.group_size + self.num_bits = self.weight_quant.num_bits + self.packed_factor = 32 // self.num_bits + + assert self.weight_quant.symmetric, ( + "Only symmetric quantization is supported for W4A8 MoE" + ) + assert self.weight_quant.actorder != "group" + assert self.group_size == 128, "Only group size 128 supported for W4A8 MoE" + + self.disable_expert_map = False + self.layer_name = layer_name + + from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 + from vllm.model_executor.layers.quantization.utils.quant_utils import ( + GroupShape, + ) + + self.quant_fp8 = QuantFP8(static=False, group_shape=GroupShape.PER_TOKEN) + + def create_weights( + self, + layer: torch.nn.Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + layer.intermediate_size_per_partition = intermediate_size_per_partition + layer.hidden_size = hidden_size + layer.num_experts = num_experts + layer.orig_dtype = params_dtype + layer.weight_block_size = None + + # requirement for CUTLASS reorder_tensor + assert hidden_size % 256 == 0, f"{hidden_size=} must be divisible by 256" + assert intermediate_size_per_partition % 256 == 0, ( + f"{intermediate_size_per_partition=} must be divisible by 256" + ) + # storage type, pack 8xint4 into int32 + params_dtype = torch.int32 + + # WEIGHTS + w13_weight_packed = torch.nn.Parameter( + torch.empty( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // self.packed_factor, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_packed", w13_weight_packed) + set_weight_attrs(w13_weight_packed, extra_weight_attrs) + + w2_weight_packed = torch.nn.Parameter( + torch.empty( + num_experts, + hidden_size, + intermediate_size_per_partition // self.packed_factor, + dtype=params_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight_packed", w2_weight_packed) + set_weight_attrs(w2_weight_packed, extra_weight_attrs) + + # SCALES + # weight_scale refers to the 
group-wise scales + # they are initially loaded as bf16, we will convert to fp8 + # after loading + w13_weight_scale = torch.nn.Parameter( + torch.ones( + num_experts, + 2 * intermediate_size_per_partition, + hidden_size // self.group_size, + dtype=layer.orig_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w13_weight_scale", w13_weight_scale) + + w2_weight_scale = torch.nn.Parameter( + torch.ones( + num_experts, + hidden_size, + intermediate_size_per_partition // self.group_size, + dtype=layer.orig_dtype, + ), + requires_grad=False, + ) + layer.register_parameter("w2_weight_scale", w2_weight_scale) + # Add PER-GROUP quantization for FusedMoE.weight_loader. + extra_weight_attrs.update( + {"quant_method": FusedMoeWeightScaleSupported.GROUP.value} + ) + set_weight_attrs(w13_weight_scale, extra_weight_attrs) + set_weight_attrs(w2_weight_scale, extra_weight_attrs) + + # weight shapes + w2_weight_shape = torch.nn.Parameter( + torch.empty(num_experts, 2), requires_grad=False + ) + layer.register_parameter("w2_weight_shape", w2_weight_shape) + set_weight_attrs(w2_weight_shape, extra_weight_attrs) + w13_weight_shape = torch.nn.Parameter( + torch.empty(num_experts, 2), requires_grad=False + ) + layer.register_parameter("w13_weight_shape", w13_weight_shape) + set_weight_attrs(w13_weight_shape, extra_weight_attrs) + + # don't use input scales + layer.w13_input_scale = None + layer.w2_input_scale = None + + def process_weights_after_loading(self, layer): + device = layer.w13_weight_packed.device + + # STRIDES + # A, C + self.a_strides1_c_strides2 = torch.full( + (layer.local_num_experts,), + layer.hidden_size, + device=device, + dtype=torch.int64, + ) + self.a_strides2 = torch.full( + (layer.local_num_experts,), + layer.intermediate_size_per_partition, + device=device, + dtype=torch.int64, + ) + self.c_strides1 = torch.full( + (layer.local_num_experts,), + 2 * layer.intermediate_size_per_partition, + device=device, + dtype=torch.int64, + ) + + # S (group-wise scales) + # sizeof(StrideS) = 16 bytes, so we need to use 2xint64 to encode it + self.s_strides1 = torch.zeros( + (layer.local_num_experts, 2), device=device, dtype=torch.int64 + ) + self.s_strides1[:, 0] = 2 * layer.intermediate_size_per_partition + + self.s_strides2 = torch.zeros( + (layer.local_num_experts, 2), device=device, dtype=torch.int64 + ) + self.s_strides2[:, 0] = layer.hidden_size + + # encode and reorder weight tensors, and get the layout to pass to + # the grouped gemm kernel. 
`b_strides1/2` specifies the entire layout + convert_packed_uint4b8_to_signed_int4_inplace(layer.w13_weight_packed) + w13_weight_shuffled, self.b_strides1 = ( + ops.cutlass_encode_and_reorder_int4b_grouped(layer.w13_weight_packed) + ) + replace_parameter(layer, "w13_weight_packed", w13_weight_shuffled) + convert_packed_uint4b8_to_signed_int4_inplace(layer.w2_weight_packed) + w2_weight_shuffled, self.b_strides2 = ( + ops.cutlass_encode_and_reorder_int4b_grouped(layer.w2_weight_packed) + ) + replace_parameter(layer, "w2_weight_packed", w2_weight_shuffled) + + # convert bf16 scales to (fp8_scales, channel_scales) + w13_weight_scale, w13_weight_chan_scale = convert_bf16_scales_to_fp8( + self.quant_fp8, layer.w13_weight_scale + ) + w2_weight_scale, w2_weight_chan_scale = convert_bf16_scales_to_fp8( + self.quant_fp8, layer.w2_weight_scale + ) + + # register channel scales + layer.register_parameter( + "w13_weight_chan_scale", + torch.nn.Parameter(w13_weight_chan_scale, requires_grad=False), + ) + layer.register_parameter( + "w2_weight_chan_scale", + torch.nn.Parameter(w2_weight_chan_scale, requires_grad=False), + ) + + # The scales are stored as (E, N, K // 128) but the kernel expects + # (E, K // 128, N) in row-major format, so we need to permute the last 2 dims + # and make it contiguous + w13_weight_scale_packed = ops.cutlass_pack_scale_fp8( + w13_weight_scale.permute(0, 2, 1).contiguous() + ) + replace_parameter(layer, "w13_weight_scale", w13_weight_scale_packed) + w2_weight_scale_packed = ops.cutlass_pack_scale_fp8( + w2_weight_scale.permute(0, 2, 1).contiguous() + ) + replace_parameter(layer, "w2_weight_scale", w2_weight_scale_packed) + + def maybe_make_prepare_finalize( + self, + routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None, + ) -> mk.FusedMoEPrepareAndFinalize | None: + return super().maybe_make_prepare_finalize(routing_tables) + + def get_fused_moe_quant_config( + self, layer: torch.nn.Module + ) -> FusedMoEQuantConfig | None: + # Store quantization scales; both per-group and per-channel + # Note we haven't specified the group size here because + # the quant config logic assumes group-wise scaling + # and channel-wise scaling are exclusive. 
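The bf16-to-fp8 scale split used above (`convert_bf16_scales_to_fp8`) is not shown in this diff. A minimal sketch of one plausible decomposition, assuming a per-output-channel fp32 rescale is chosen so that the remaining group scales fit the fp8 range (names and shapes here are illustrative assumptions, not vLLM's implementation):

import torch

def split_group_scales_sketch(bf16_scales: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    # bf16_scales: (out_channels, num_groups) group-wise dequant scales.
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    # Per-channel fp32 scale so every group scale in that row fits in the fp8 range.
    chan_scales = bf16_scales.abs().amax(dim=-1, keepdim=True).float() / fp8_max
    fp8_scales = (bf16_scales.float() / chan_scales).to(torch.float8_e4m3fn)
    # Effective dequant scale is approximately fp8_scales * chan_scales.
    return fp8_scales, chan_scales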
+ return int4_w4afp8_moe_quant_config( + w1_scale=layer.w13_weight_scale, # group scale + w2_scale=layer.w2_weight_scale, # group scale + g1_alphas=layer.w13_weight_chan_scale, + g2_alphas=layer.w2_weight_chan_scale, + per_act_token_quant=True, # always use dynamc per-token + per_out_ch_quant=True, # always use per-channel + ) + + def select_gemm_impl( + self, + prepare_finalize: mk.FusedMoEPrepareAndFinalize, + layer: torch.nn.Module, + ) -> mk.FusedMoEPermuteExpertsUnpermute: + assert self.moe_quant_config is not None + assert ( + prepare_finalize.activation_format == FusedMoEActivationFormat.Standard + ), "BatchedExperts not supported" + + from vllm.model_executor.layers.fused_moe import CutlassExpertsW4A8Fp8 + + experts: FusedMoEPermuteExpertsUnpermute + + logger.debug("CutlassExpertsW4A8Fp8(%s)", self.__class__.__name__) + experts = CutlassExpertsW4A8Fp8( + out_dtype=self.moe.in_dtype, + a_strides1=self.a_strides1_c_strides2, + a_strides2=self.a_strides2, + b_strides1=self.b_strides1, + b_strides2=self.b_strides2, + c_strides1=self.c_strides1, + c_strides2=self.a_strides1_c_strides2, + s_strides1=self.s_strides1, + s_strides2=self.s_strides2, + quant_config=self.moe_quant_config, + group_size=self.group_size, + ) + + num_dispatchers = prepare_finalize.num_dispatchers() + self.disable_expert_map = ( + num_dispatchers > 1 or not experts.supports_expert_map() + ) + + return experts + + def apply( + self, + layer: FusedMoE, + x: torch.Tensor, + router_logits: torch.Tensor, + ): + if layer.enable_eplb: + raise NotImplementedError( + "EPLB not supported for `CompressedTensorsW4A8Fp8MoEMethod` yet." + ) + assert self.moe_quant_config is not None + topk_weights, topk_ids, _ = layer.select_experts( + hidden_states=x, + router_logits=router_logits, + ) + + from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + cutlass_moe_w4a8_fp8, + ) + + return cutlass_moe_w4a8_fp8( + x, + layer.w13_weight_packed, + layer.w2_weight_packed, + topk_weights, + topk_ids, + quant_config=self.moe_quant_config, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + expert_map=None if self.disable_expert_map else layer.expert_map, + a_strides1=self.a_strides1_c_strides2, + a_strides2=self.a_strides2, + b_strides1=self.b_strides1, + b_strides2=self.b_strides2, + c_strides1=self.c_strides1, + c_strides2=self.a_strides1_c_strides2, + s_strides1=self.s_strides1, + s_strides2=self.s_strides2, + group_size=self.group_size, + ) + + @property + def supports_eplb(self) -> bool: + return False diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py index 3afadc6eb7e5b..d2701a464f129 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_nvfp4.py @@ -28,7 +28,7 @@ class CompressedTensorsW4A16Fp4(CompressedTensorsScheme): @classmethod def get_min_capability(cls) -> int: - # dont restrict as emulations + # don't restrict as emulations return 80 def create_weights( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py index a23961e897534..9a25e08cbad75 100644 --- 
a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a8_fp8.py @@ -128,14 +128,15 @@ class CompressedTensorsW4A8Fp8(CompressedTensorsScheme): ), ) - # TODO(czhu): allocate the packed fp8 scales memory here? - # the scales will be expanded by 8x via `cutlass_pack_scale_fp8` + # After loading, we will transform bf16 -> fp8 -> + # expand by 8x via `cutlass_pack_scale_fp8` + # and construct per-channel fp32 scales. weight_scale_args = { "weight_loader": weight_loader, "data": torch.empty( output_size_per_partition, scales_and_zp_size, - dtype=torch.float8_e4m3fn, + dtype=params_dtype, ), } @@ -152,17 +153,9 @@ class CompressedTensorsW4A8Fp8(CompressedTensorsScheme): data=torch.empty(2, dtype=torch.int64), weight_loader=weight_loader ) - # per-channel scales - weight_chan_scale = ChannelQuantScaleParameter( - data=torch.empty((output_size_per_partition, 1), dtype=torch.float32), - output_dim=0, - weight_loader=weight_loader, - ) - layer.register_parameter("weight_packed", weight) layer.register_parameter("weight_scale", weight_scale) layer.register_parameter("weight_shape", weight_shape) - layer.register_parameter("weight_chan_scale", weight_chan_scale) self.kernel = kernel_type( mp_linear_kernel_config, diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 7ebe40ec84687..11097cf36f5ca 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from typing import Any, Optional import torch @@ -140,23 +139,6 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts @@ -172,10 +154,10 @@ class ExpertsInt8MoEMethod(FusedMoEMethodBase): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 419ddd91b64e0..60dde9eb57e0f 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -1,7 +1,6 @@ # 
SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from enum import Enum from functools import partial from typing import TYPE_CHECKING, Any, Optional @@ -95,7 +94,7 @@ from vllm.model_executor.parameter import ( ModelWeightParameter, PerTensorScaleParameter, ) -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.utils import replace_parameter, set_weight_attrs from vllm.platforms import current_platform from vllm.scalar_type import scalar_types from vllm.utils.deep_gemm import ( @@ -549,46 +548,50 @@ class Fp8LinearMethod(LinearMethodBase): assert not self.act_q_static size_k_first = False - weight, weight_scale = process_fp8_weight_block_strategy( + weight, weight_scale_inv = process_fp8_weight_block_strategy( layer.weight, layer.weight_scale_inv ) - # Delete the weight_scale_inv parameter to avoid confusion - # with the weight_scale parameter - del layer.weight_scale_inv + + # Update layer with new values + replace_parameter(layer, "weight", weight.data) + replace_parameter(layer, "weight_scale_inv", weight_scale_inv.data) # If checkpoint not serialized fp8, quantize the weights. - elif not self.quant_config.is_checkpoint_fp8_serialized: - qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None) - weight = qweight.t() - - # If checkpoint is fp8 per-tensor, handle that there are N scales for N - # shards in a fused module else: - weight = layer.weight - weight_scale = layer.weight_scale + if not self.quant_config.is_checkpoint_fp8_serialized: + qweight, weight_scale = ops.scaled_fp8_quant(layer.weight, scale=None) + weight = qweight.t() - # If using w8a8, torch._scaled_mm needs per tensor, so - # requantize the logical shards as a single weight. - if not self.use_marlin: - weight, weight_scale, input_scale = process_fp8_weight_tensor_strategy( - weight, - weight_scale, - layer.logical_widths, - getattr(layer, "input_scale", None), - ) - if self.act_q_static: - assert input_scale is not None - input_scale = input_scale.max() - weight = weight.t() + # If checkpoint is fp8 per-tensor, handle that there are N scales for N + # shards in a fused module + else: + weight = layer.weight + weight_scale = layer.weight_scale - # Update layer with new values. - layer.weight = Parameter(weight.data, requires_grad=False) - layer.weight_scale = Parameter(weight_scale.data, requires_grad=False) - layer.input_scale = ( - Parameter(input_scale, requires_grad=False) - if input_scale is not None - else None - ) + # If using w8a8, torch._scaled_mm needs per tensor, so + # requantize the logical shards as a single weight. + if not self.use_marlin: + weight, weight_scale, input_scale = ( + process_fp8_weight_tensor_strategy( + weight, + weight_scale, + layer.logical_widths, + getattr(layer, "input_scale", None), + ) + ) + if self.act_q_static: + assert input_scale is not None + input_scale = input_scale.max() + weight = weight.t() + + # Update layer with new values. 
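The `replace_parameter` helper imported above is not reproduced in this diff. Assuming it simply re-registers the processed tensor as a plain, frozen `nn.Parameter` (so later code and `torch.compile` no longer see a load-time Parameter subclass), a minimal stand-in with the same call shape might look like this sketch:

import torch

def replace_parameter_sketch(module: torch.nn.Module, name: str, data: torch.Tensor) -> None:
    # Re-register the processed tensor as an ordinary frozen Parameter,
    # replacing whatever load-time parameter object was bound to `name`.
    setattr(module, name, torch.nn.Parameter(data.detach(), requires_grad=False))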
+ replace_parameter(layer, "weight", weight.data) + replace_parameter(layer, "weight_scale", weight_scale.data) + + if input_scale is not None: + replace_parameter(layer, "input_scale", input_scale) + else: + layer.input_scale = None if self.use_marlin: prepare_fp8_layer_for_marlin( @@ -615,7 +618,7 @@ class Fp8LinearMethod(LinearMethodBase): return self.w8a8_block_fp8_linear.apply( input=x, weight=layer.weight, - weight_scale=layer.weight_scale, + weight_scale=layer.weight_scale_inv, input_scale=layer.input_scale, bias=bias, ) @@ -644,10 +647,15 @@ class Fp8LinearMethod(LinearMethodBase): return torch.nn.functional.linear(x, weight_bf16.t(), bias) if self.use_marlin: + if self.block_quant: + weight_scale = layer.weight_scale_inv + else: + weight_scale = layer.weight_scale + return apply_fp8_marlin_linear( input=x, weight=layer.weight, - weight_scale=layer.weight_scale, + weight_scale=weight_scale, workspace=layer.workspace, size_n=layer.output_size_per_partition, size_k=layer.input_size_per_partition, @@ -661,7 +669,7 @@ class Fp8LinearMethod(LinearMethodBase): return self.w8a8_block_fp8_linear.apply( input=x, weight=layer.weight, - weight_scale=layer.weight_scale, + weight_scale=layer.weight_scale_inv, input_scale=layer.input_scale, bias=bias, ) @@ -938,22 +946,18 @@ class Fp8MoEMethod(FusedMoEMethodBase): w2_weight_scale_inv = layer.w2_weight_scale_inv # torch.compile() cannot use Parameter subclasses. - layer.w13_weight = Parameter(w13_weight, requires_grad=False) - layer.w13_weight_scale_inv = Parameter( - w13_weight_scale_inv, requires_grad=False - ) - layer.w2_weight = Parameter(w2_weight, requires_grad=False) - layer.w2_weight_scale_inv = Parameter( - w2_weight_scale_inv, requires_grad=False - ) + replace_parameter(layer, "w13_weight", w13_weight) + replace_parameter(layer, "w13_weight_scale_inv", w13_weight_scale_inv) + replace_parameter(layer, "w2_weight", w2_weight) + replace_parameter(layer, "w2_weight_scale_inv", w2_weight_scale_inv) if self.rocm_aiter_moe_enabled: # reshaping weights is required for aiter moe kernel. shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( layer.w13_weight.data, layer.w2_weight.data ) - layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) - layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False) + replace_parameter(layer, "w13_weight", shuffled_w13) + replace_parameter(layer, "w2_weight", shuffled_w2) # DeepGemm scales need to be transposed and aligned. We try to do # it ahead of time for performance reasons. @@ -991,13 +995,14 @@ class Fp8MoEMethod(FusedMoEMethodBase): # Re-initialize w13_scale because we directly quantize # merged w13 weights and generate a single scaling factor. - layer.w13_weight_scale = torch.nn.Parameter( + replace_parameter( + layer, + "w13_weight_scale", torch.ones( layer.local_num_experts, dtype=torch.float32, device=w13_weight.device, ), - requires_grad=False, ) for expert in range(layer.local_num_experts): w13_weight[expert, :, :], layer.w13_weight_scale[expert] = ( @@ -1006,16 +1011,17 @@ class Fp8MoEMethod(FusedMoEMethodBase): w2_weight[expert, :, :], layer.w2_weight_scale[expert] = ( ops.scaled_fp8_quant(layer.w2_weight.data[expert, :, :]) ) - layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) - layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) + replace_parameter(layer, "w13_weight", w13_weight) + replace_parameter(layer, "w2_weight", w2_weight) + if self.rocm_aiter_moe_enabled: # reshaping weights is required for aiter moe kernel. 
shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( layer.w13_weight, layer.w2_weight ) - layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) - layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False) + replace_parameter(layer, "w13_weight", shuffled_w13) + replace_parameter(layer, "w2_weight", shuffled_w2) # If checkpoint is fp8, we need to handle that the # MoE kernels require single activation scale and single weight # scale for w13 per expert. @@ -1036,12 +1042,8 @@ class Fp8MoEMethod(FusedMoEMethodBase): "fp8 MoE layer. Using the maximum across experts " "for each layer." ) - layer.w13_input_scale = torch.nn.Parameter( - layer.w13_input_scale.max(), requires_grad=False - ) - layer.w2_input_scale = torch.nn.Parameter( - layer.w2_input_scale.max(), requires_grad=False - ) + replace_parameter(layer, "w13_input_scale", layer.w13_input_scale.max()) + replace_parameter(layer, "w2_input_scale", layer.w2_input_scale.max()) if current_platform.is_fp8_fnuz(): # Normalize the weights and scales w13_weight, w13_weight_scale, w13_input_scale = ( @@ -1055,22 +1057,14 @@ class Fp8MoEMethod(FusedMoEMethodBase): ) ) # Reset the parameter - layer.w13_weight = torch.nn.Parameter(w13_weight, requires_grad=False) - layer.w13_weight_scale = torch.nn.Parameter( - w13_weight_scale, requires_grad=False - ) + replace_parameter(layer, "w13_weight", w13_weight) + replace_parameter(layer, "w13_weight_scale", w13_weight_scale) if w13_input_scale is not None: - layer.w13_input_scale = torch.nn.Parameter( - w13_input_scale, requires_grad=False - ) - layer.w2_weight = torch.nn.Parameter(w2_weight, requires_grad=False) - layer.w2_weight_scale = torch.nn.Parameter( - w2_weight_scale, requires_grad=False - ) + replace_parameter(layer, "w13_input_scale", w13_input_scale) + replace_parameter(layer, "w2_weight", w2_weight) + replace_parameter(layer, "w2_weight_scale", w2_weight_scale) if w2_input_scale is not None: - layer.w2_input_scale = torch.nn.Parameter( - w2_input_scale, requires_grad=False - ) + replace_parameter(layer, "w2_input_scale", w2_input_scale) # Fp8 moe kernel needs single weight scale for w13 per expert. # We take the max then dequant and requant each expert. 
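A sketch of the "max then dequant and requant" step the comment above describes, applied to one expert's fused w13 weight that was loaded with per-shard scales (illustrative only; vLLM's shard bookkeeping differs in detail):

import torch

def requant_w13_single_scale(w13_q: torch.Tensor, shard_scales: torch.Tensor,
                             shard_size: int) -> tuple[torch.Tensor, torch.Tensor]:
    # w13_q: (num_shards * shard_size, hidden) fp8 weight for one expert.
    # shard_scales: (num_shards,) per-shard dequant scales from the checkpoint.
    max_scale = shard_scales.max()
    out = torch.empty_like(w13_q)
    for i, scale in enumerate(shard_scales):
        rows = slice(i * shard_size, (i + 1) * shard_size)
        dequant = w13_q[rows].to(torch.float32) * scale      # back to full precision
        out[rows] = (dequant / max_scale).to(w13_q.dtype)    # requantize with the shared max
    return out, max_scale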
@@ -1094,12 +1088,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): layer.w13_weight, layer.w2_weight ) - layer.w13_weight = torch.nn.Parameter(shuffled_w13, requires_grad=False) - layer.w2_weight = torch.nn.Parameter(shuffled_w2, requires_grad=False) + replace_parameter(layer, "w13_weight", shuffled_w13) + replace_parameter(layer, "w2_weight", shuffled_w2) - layer.w13_weight_scale = torch.nn.Parameter( - max_w13_scales, requires_grad=False - ) + replace_parameter(layer, "w13_weight_scale", max_w13_scales) if self.flashinfer_moe_backend is not None: # NOTE: weights have to be swapped since the activation is @@ -1242,41 +1234,20 @@ class Fp8MoEMethod(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if enable_eplb: - assert expert_load_view is not None - assert logical_to_physical_map is not None - assert logical_replica_count is not None - assert isinstance(layer, FusedMoE) - if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: - assert activation == "silu", ( - f"Expected 'silu' activation but got {activation}" + if layer.enable_eplb: + raise NotImplementedError("EPLB not supported for `Fp8MoEMethod` yet.") + assert layer.activation == "silu", ( + f"Expected 'silu' activation but got {layer.activation}" ) if self.block_quant: import vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe # noqa: E501, F401 e_score_correction_bias = ( - e_score_correction_bias.to(x.dtype) - if e_score_correction_bias is not None + layer.e_score_correction_bias.to(x.dtype) + if layer.e_score_correction_bias is not None else None ) routing_method_type = layer.routing_method_type @@ -1290,29 +1261,31 @@ class Fp8MoEMethod(FusedMoEMethodBase): w13_weight_scale_inv=layer.w13_weight_scale_inv, w2_weight=layer.w2_weight, w2_weight_scale_inv=layer.w2_weight_scale_inv, - global_num_experts=global_num_experts, - top_k=top_k, - num_expert_group=num_expert_group, - topk_group=topk_group, + global_num_experts=layer.global_num_experts, + top_k=layer.top_k, + num_expert_group=layer.num_expert_group, + topk_group=layer.topk_group, intermediate_size=layer.intermediate_size_per_partition, expert_offset=layer.ep_rank * layer.local_num_experts, local_num_experts=layer.local_num_experts, block_shape=self.weight_block_size, routing_method_type=routing_method_type, - routed_scaling=routed_scaling_factor, + routed_scaling=layer.routed_scaling_factor, ) else: - assert not renormalize and custom_routing_function is not None + assert ( + not layer.renormalize and layer.custom_routing_function is not None + ) result = apply_flashinfer_per_tensor_scale_fp8( layer=layer, hidden_states=x, router_logits=router_logits, - routing_bias=e_score_correction_bias, - global_num_experts=global_num_experts, - top_k=top_k, - num_expert_group=num_expert_group, - topk_group=topk_group, - 
apply_router_weight_on_input=apply_router_weight_on_input, + routing_bias=layer.e_score_correction_bias, + global_num_experts=layer.global_num_experts, + top_k=layer.top_k, + num_expert_group=layer.num_expert_group, + topk_group=layer.topk_group, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) select_result = layer.select_experts( @@ -1333,13 +1306,15 @@ class Fp8MoEMethod(FusedMoEMethodBase): layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - expert_map=expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) elif self.use_marlin: - assert activation == "silu", f"{activation} not supported for Marlin MoE." + assert layer.activation == "silu", ( + f"{layer.activation} not supported for Marlin MoE." + ) result = fused_marlin_moe( x, layer.w13_weight, @@ -1352,20 +1327,22 @@ class Fp8MoEMethod(FusedMoEMethodBase): topk_weights, topk_ids, quant_type_id=scalar_types.float8_e4m3fn.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, input_dtype=self.marlin_input_dtype, workspace=layer.workspace, ) elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: - assert activation == "silu", ( - f"Expected 'silu' activation but got {activation}" + assert layer.activation == "silu", ( + f"Expected 'silu' activation but got {layer.activation}" ) if not self.block_quant: - assert not renormalize and custom_routing_function is not None - assert scoring_func == "sigmoid", ( - f"Expected 'sigmoid' scoring func but got {scoring_func}" + assert ( + not layer.renormalize and layer.custom_routing_function is not None + ) + assert layer.scoring_func == "sigmoid", ( + f"Expected 'sigmoid' scoring func but got {layer.scoring_func}" ) # Delegate to CUTLASS FlashInfer path; function already bound with # use_deepseek_fp8_block_scale for block-quant when applicable @@ -1375,10 +1352,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): topk_weights, topk_ids, inplace=False, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) else: from vllm.model_executor.layers.fused_moe import fused_experts @@ -1390,10 +1367,10 @@ class Fp8MoEMethod(FusedMoEMethodBase): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - global_num_experts=global_num_experts, - apply_router_weight_on_input=apply_router_weight_on_input, - expert_map=expert_map, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, allow_deep_gemm=self.allow_deep_gemm, allow_cutlass_block_scaled_grouped_gemm=( diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index bcdfafb50fc5a..13aa2bcad21ba 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ 
b/vllm/model_executor/layers/quantization/gguf.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable, Mapping +from collections.abc import Mapping from types import MappingProxyType from typing import Any, Optional @@ -82,6 +82,7 @@ class GGUFConfig(QuantizationConfig): return UnquantizedEmbeddingMethod() return GGUFEmbeddingMethod(self) elif isinstance(layer, FusedMoE): + # TODO: Select UnquantizedFusedMoEMethod on unquantized layers. return GGUFMoEMethod(self, layer.moe_config) return None @@ -624,26 +625,9 @@ class GGUFMoEMethod(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert activation == "silu", "Only SiLU activation is supported." - if apply_router_weight_on_input: + assert layer.activation == "silu", "Only SiLU activation is supported." + if layer.apply_router_weight_on_input: raise NotImplementedError( "Apply router weight on input is not supported for" "fused GGUF MoE method." @@ -661,7 +645,7 @@ class GGUFMoEMethod(FusedMoEMethodBase): topk_ids, layer.w13_qweight_type.weight_type, layer.w2_qweight_type.weight_type, - activation, + layer.activation, ) diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index 56034e11329dc..8d1715f52f097 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from copy import deepcopy from typing import Any, Optional @@ -790,25 +789,8 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - assert activation == "silu", "Only SiLU activation is supported." + assert layer.activation == "silu", "Only SiLU activation is supported." 
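The signature change applied to every MoE method in this diff follows one pattern: `apply()` shrinks to `(layer, x, router_logits)` and routing configuration is read off the FusedMoE layer instead of being threaded through as keyword arguments. A self-contained sketch of that shape (the `_Layer` class below is an illustrative stand-in, not vLLM's `FusedMoE`):

import torch

class _Layer:
    # Routing config lives on the layer instead of being passed to apply().
    activation = "silu"
    top_k = 2
    renormalize = True

    def select_experts(self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
        weights, ids = torch.topk(router_logits.softmax(dim=-1), self.top_k, dim=-1)
        if self.renormalize:
            weights = weights / weights.sum(dim=-1, keepdim=True)
        return weights, ids, None

def apply_sketch(layer: _Layer, x: torch.Tensor, router_logits: torch.Tensor):
    assert layer.activation == "silu", "Only SiLU activation is supported."
    topk_weights, topk_ids, _ = layer.select_experts(x, router_logits)
    # The expert kernel call (fused_experts, fused_marlin_moe, ...) consumes these
    # along with other layer attributes such as layer.expert_map.
    return topk_weights, topk_ids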
topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, @@ -829,9 +811,9 @@ class GPTQMarlinMoEMethod(FusedMoEMethodBase): input_global_scale1=getattr(layer, "w13_input_global_scale", None), input_global_scale2=getattr(layer, "w2_input_global_scale", None), quant_type_id=self.quant_type.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, g_idx1=layer.w13_g_idx, g_idx2=layer.w2_g_idx, sort_indices1=layer.w13_g_idx_sort_indices, diff --git a/vllm/model_executor/layers/quantization/input_quant_fp8.py b/vllm/model_executor/layers/quantization/input_quant_fp8.py index 7ded8eea79060..a5db086fb4729 100644 --- a/vllm/model_executor/layers/quantization/input_quant_fp8.py +++ b/vllm/model_executor/layers/quantization/input_quant_fp8.py @@ -5,6 +5,7 @@ import torch import torch.nn.functional as F from vllm import _custom_ops as ops +from vllm._aiter_ops import rocm_aiter_ops from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape from vllm.platforms import current_platform @@ -45,10 +46,13 @@ class QuantFP8(CustomOp): super().__init__() self.static = static self.group_shape = group_shape + self.use_per_token_if_dynamic = group_shape == GroupShape.PER_TOKEN self.num_token_padding = num_token_padding self.column_major_scales = column_major_scales self.use_ue8m0 = use_ue8m0 + self.use_aiter = rocm_aiter_ops.is_linear_fp8_enaled() + self.is_group_quant = group_shape.is_per_group() if self.is_group_quant: assert not static, "Group quantization only supports dynamic mode" @@ -92,6 +96,33 @@ class QuantFP8(CustomOp): use_per_token_if_dynamic=self.use_per_token_if_dynamic, ) + def forward_hip( + self, + x: torch.Tensor, + scale: torch.Tensor | None = None, + scale_ub: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + use_aiter_quant = ( + not self.is_group_quant + and self.use_aiter + and scale_ub is None + and x.is_contiguous() + ) + use_aiter_per_tensor_quant = ( + use_aiter_quant and self.group_shape == GroupShape.PER_TENSOR + ) + use_aiter_per_token_quant = ( + use_aiter_quant and self.group_shape == GroupShape.PER_TOKEN + ) + + if use_aiter_per_tensor_quant: + return rocm_aiter_ops.per_tensor_quant(x, _FP8_DTYPE, scale) + if use_aiter_per_token_quant: + return rocm_aiter_ops.per_token_quant(x, _FP8_DTYPE, scale) + + # Fallback to CUDA implementation + return self.forward_cuda(x, scale, scale_ub) + def forward_native( self, x: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index a1571afba2974..463c74c1c1482 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from typing import Any, Optional import torch @@ -440,31 +439,14 @@ class XPUFp8MoEMethod(FusedMoEMethodBase): layer: torch.nn.Module, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - 
custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor: return layer.ipex_fusion( x, - use_grouped_topk, - top_k, + layer.use_grouped_topk, + layer.top_k, router_logits, - renormalize, - topk_group, - num_expert_group, - custom_routing_function=custom_routing_function, + layer.renormalize, + layer.topk_group, + layer.num_expert_group, + custom_routing_function=layer.custom_routing_function, ) diff --git a/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py index 8ef6457c952f1..c9c1a3abf7fd3 100644 --- a/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py +++ b/vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py @@ -6,7 +6,11 @@ import torch from vllm import _custom_ops as ops from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 -from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape +from vllm.model_executor.layers.quantization.utils.quant_utils import ( + GroupShape, + convert_bf16_scales_to_fp8, + convert_packed_uint4b8_to_signed_int4_inplace, +) from vllm.model_executor.parameter import BasevLLMParameter, permute_param_layout_ from vllm.platforms import current_platform from vllm.scalar_type import scalar_types @@ -48,7 +52,6 @@ class CutlassW4A8LinearKernel(MPLinearKernel): "CUTLASS W4A8, only supported int4", ) - # TODO(czhu): support -1 (column-wise) if c.group_size != 128: return False, "Only group_size 128 is supported" @@ -71,9 +74,9 @@ class CutlassW4A8LinearKernel(MPLinearKernel): # `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0} # `weight_scale` is: {input_dim = 0, output_dim = 1} def process_weights_after_loading(self, layer: torch.nn.Module): - # TODO(czhu): optimize speed/mem usage def transform_w_q(x): assert isinstance(x, BasevLLMParameter) + convert_packed_uint4b8_to_signed_int4_inplace(x.data) permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0) x.data = ops.cutlass_encode_and_reorder_int4b(x.data.t().contiguous().t()) return x @@ -85,10 +88,18 @@ class CutlassW4A8LinearKernel(MPLinearKernel): x.data = ops.cutlass_pack_scale_fp8(x.data) return x + w_s = getattr(layer, self.w_s_name) + fp8_scales, chan_scales = convert_bf16_scales_to_fp8(self.quant_fp8, w_s.data) + w_s.data = fp8_scales + + # register per-channel scales + layer.register_parameter( + "weight_chan_scale", torch.nn.Parameter(chan_scales, requires_grad=False) + ) + # Encode/reorder weights and pack scales self._transform_param(layer, self.w_q_name, transform_w_q) self._transform_param(layer, self.w_s_name, transform_w_s) - self._transform_param(layer, "weight_chan_scale", lambda x: x) def apply_weights( self, diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index 78456dcf1ca56..f0497a8722909 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -45,6 +45,13 @@ class BaseKVCacheMethod(QuantizeMethodBase): raise RuntimeError(f"{self.__class__.__name__}.apply 
should not be called.") def process_weights_after_loading(self, layer: torch.nn.Module) -> None: + # skip if there are no weights to process (for example, weight reloading) + if not hasattr(layer, "q_scale"): + assert not hasattr(layer, "k_scale") + assert not hasattr(layer, "v_scale") + assert not hasattr(layer, "prob_scale") + return + # If the kv-cache dtype is auto, we enforce the k/v_scale to be 1.0 # regardless whether the kv-scale is available in the checkpoint. # No need to process kv scales after loading if we are going to diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 034e97a713cdd..e825cb33c3580 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from fnmatch import fnmatch from typing import TYPE_CHECKING, Any, Optional @@ -707,43 +706,27 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: if layer.enable_eplb: raise NotImplementedError( "EPLB not supported for `ModelOptFp8MoEMethod` yet." 
) - assert activation == "silu", ( - f"Expected 'silu' activation but got {activation}" + assert layer.activation == "silu", ( + f"Expected 'silu' activation but got {layer.activation}" ) - assert not renormalize + + assert not layer.renormalize return apply_flashinfer_per_tensor_scale_fp8( layer=layer, hidden_states=x, router_logits=router_logits, - routing_bias=e_score_correction_bias, - global_num_experts=global_num_experts, - top_k=top_k, - num_expert_group=num_expert_group, - topk_group=topk_group, - apply_router_weight_on_input=apply_router_weight_on_input, + routing_bias=layer.e_score_correction_bias, + global_num_experts=layer.global_num_experts, + top_k=layer.top_k, + num_expert_group=layer.num_expert_group, + topk_group=layer.topk_group, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) # Expert selection @@ -753,9 +736,9 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): ) if self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: - assert activation in ("silu", "relu2_no_mul"), ( + assert layer.activation in ("silu", "relu2_no_mul"), ( "Expected activation to be in ('silu', 'relu2_no_mul')," - f"but got {activation}" + f"but got {layer.activation}" ) return flashinfer_cutlass_moe_fp8( x, @@ -763,10 +746,10 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): topk_weights, topk_ids, inplace=False, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) else: from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts @@ -780,11 +763,11 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, + activation=layer.activation, quant_config=self.moe_quant_config, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) @@ -1504,23 +1487,6 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: if not self.moe.is_act_and_mul: assert ( @@ -1535,7 +1501,7 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): self.allow_flashinfer and self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM ): - if enable_eplb: + if layer.enable_eplb: raise NotImplementedError( "EPLB not supported for `ModelOptNvFp4FusedMoE` yet." 
) @@ -1543,12 +1509,12 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): layer=layer, x=x, router_logits=router_logits, - top_k=top_k, - global_num_experts=global_num_experts, - num_expert_group=num_expert_group, - topk_group=topk_group, - custom_routing_function=custom_routing_function, - e_score_correction_bias=e_score_correction_bias, + top_k=layer.top_k, + global_num_experts=layer.global_num_experts, + num_expert_group=layer.num_expert_group, + topk_group=layer.topk_group, + custom_routing_function=layer.custom_routing_function, + e_score_correction_bias=layer.e_score_correction_bias, ) topk_weights, topk_ids, _ = layer.select_experts( @@ -1571,9 +1537,9 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): global_scale1=layer.w13_weight_scale_2, global_scale2=layer.w2_weight_scale_2, quant_type_id=scalar_types.float4_e2m1f.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, input_dtype=self.marlin_input_dtype, ) @@ -1604,10 +1570,10 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): topk_ids=topk_ids, quant_config=self.moe_quant_config, inplace=False, - activation=activation, - global_num_experts=global_num_experts, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) else: # If no modular kernel is provided, use cutlass_moe_fp4 for TP case @@ -1622,8 +1588,8 @@ class ModelOptNvFp4FusedMoE(FusedMoEMethodBase): topk_weights=topk_weights, topk_ids=topk_ids, quant_config=self.moe_quant_config, - expert_map=expert_map, - apply_router_weight_on_input=apply_router_weight_on_input, + expert_map=layer.expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, # TODO: derive from arguments m=x.shape[0], n=layer.w2_weight.shape[2] * 2, diff --git a/vllm/model_executor/layers/quantization/moe_wna16.py b/vllm/model_executor/layers/quantization/moe_wna16.py index cf348290a2716..0131a330f70d2 100644 --- a/vllm/model_executor/layers/quantization/moe_wna16.py +++ b/vllm/model_executor/layers/quantization/moe_wna16.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from typing import Any, Optional import torch @@ -60,7 +59,7 @@ class MoeWNA16Config(QuantizationConfig): if self.linear_quant_method == "gptq": self.use_marlin = GPTQMarlinConfig.is_gptq_marlin_compatible(full_config) - elif self.linear_quant_method == "awq": + elif self.linear_quant_method in ("awq", "awq_marlin"): capability_tuple = current_platform.get_device_capability() device_capability = ( -1 if capability_tuple is None else capability_tuple.to_int() @@ -107,7 +106,7 @@ class MoeWNA16Config(QuantizationConfig): if linear_quant_method == "gptq": has_zp = not cls.get_from_keys(config, ["sym"]) modules_to_not_convert = [] - elif linear_quant_method == "awq": + elif linear_quant_method in ("awq", "awq_marlin"): has_zp = cls.get_from_keys(config, ["zero_point"]) modules_to_not_convert = cls.get_from_keys_or( config, ["modules_to_not_convert"], None @@ -184,7 +183,7 @@ class MoeWNA16Config(QuantizationConfig): return GPTQConfig.from_config(self.full_config).get_quant_method( 
layer, prefix ) - elif self.linear_quant_method == "awq": + elif self.linear_quant_method in ("awq", "awq_marlin"): if self.use_marlin and check_marlin_supports_layer( layer, self.group_size ): @@ -362,27 +361,10 @@ class MoeWNA16Method(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: from vllm.model_executor.layers.fused_moe import fused_experts - assert activation == "silu", "Only SiLU activation is supported." + assert layer.activation == "silu", "Only SiLU activation is supported." topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, router_logits=router_logits, @@ -395,9 +377,9 @@ class MoeWNA16Method(FusedMoEMethodBase): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) @@ -468,7 +450,8 @@ class MoeWNA16Method(FusedMoEMethodBase): shard_size = layer.intermediate_size_per_partition # convert gptq and awq weight to a standard format - if layer.quant_config.linear_quant_method == "awq": + # awq_marlin uses the same weight format as awq + if layer.quant_config.linear_quant_method in ("awq", "awq_marlin"): assert layer.quant_config.weight_bits == 4 if "weight" in weight_name: loaded_weight = convert_awq_tensor(loaded_weight, "qweight") diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index 5d330e837eea0..6eae4e9e66e1b 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from enum import Enum from typing import Optional @@ -892,25 +891,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: - if 
enable_eplb: + if layer.enable_eplb: raise NotImplementedError("EPLB is not supported for mxfp4") if self.mxfp4_backend == Mxfp4Backend.MARLIN: @@ -933,26 +915,26 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): global_scale1=None, global_scale2=None, quant_type_id=scalar_types.float4_e2m1f.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - activation=activation, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + activation=layer.activation, + expert_map=layer.expert_map, input_dtype=self.marlin_input_dtype, ) assert _can_support_mxfp4( - use_grouped_topk, - topk_group, - num_expert_group, - expert_map, - custom_routing_function, - e_score_correction_bias, - apply_router_weight_on_input, - scoring_func, - activation, - expert_load_view, - logical_to_physical_map, - logical_replica_count, + layer.use_grouped_topk, + layer.topk_group, + layer.num_expert_group, + layer.expert_map, + layer.custom_routing_function, + layer.e_score_correction_bias, + layer.apply_router_weight_on_input, + layer.scoring_func, + layer.activation, + layer.expert_load_view, + layer.logical_to_physical_map, + layer.logical_replica_count, ), "MXFP4 are not supported with this configuration." if ( @@ -988,8 +970,8 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): None, # output1_scale_scalar None, # output1_scale_gate_scalar None, # output2_scale_scalar - global_num_experts, - top_k, + layer.global_num_experts, + layer.top_k, None, # n_group None, # topk_group self.intermediate_size, # padded to multiple of 256 @@ -997,7 +979,7 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): self.num_experts, # local num experts None, None, - 1 if renormalize else 0, # routing_method_type, renormalize + 1 if layer.renormalize else 0, # routing_method_type, renormalize True, # do finalize tune_max_num_tokens=max(self.max_capture_size, 1), )[0] @@ -1081,12 +1063,12 @@ class Mxfp4MoEMethod(FusedMoEMethodBase): w1=layer.w13_weight, w2=layer.w2_weight, gating_output=router_logits, - topk=top_k, - renormalize=renormalize, - global_num_experts=global_num_experts, - expert_map=expert_map, + topk=layer.top_k, + renormalize=layer.renormalize, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, - apply_router_weight_on_input=apply_router_weight_on_input, + apply_router_weight_on_input=layer.apply_router_weight_on_input, ) else: raise ValueError(f"Unsupported backend: {self.mxfp4_backend}") @@ -1138,37 +1120,20 @@ class IpexMxfp4MoEMethod(Mxfp4MoEMethod): layer: torch.nn.Module, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor: - assert activation == "swigluoai", ( + assert layer.activation == "swigluoai", ( "Only swiglu_oai activation is supported for IPEX MXFP4 MoE" ) hidden_size_pad = 
round_up(self.original_hidden_size, 128) x_pad = torch.nn.functional.pad(x, (0, hidden_size_pad - x.size(-1))) hidden_states = layer.ipex_fusion( x_pad, - use_grouped_topk, - top_k, + layer.use_grouped_topk, + layer.top_k, router_logits, - renormalize, - topk_group, - num_expert_group, + layer.renormalize, + layer.topk_group, + layer.num_expert_group, activation="swiglu_oai", ) hidden_states = hidden_states[..., : self.original_hidden_size].contiguous() diff --git a/vllm/model_executor/layers/quantization/quark/quark_moe.py b/vllm/model_executor/layers/quantization/quark/quark_moe.py index 9e2b2134310fc..d84e22d1fa0f2 100644 --- a/vllm/model_executor/layers/quantization/quark/quark_moe.py +++ b/vllm/model_executor/layers/quantization/quark/quark_moe.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Callable from typing import Any import torch @@ -337,23 +336,6 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, @@ -371,13 +353,15 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): w2=layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, quant_config=self.moe_quant_config, - expert_map=expert_map, + expert_map=layer.expert_map, ) elif self.use_marlin: - assert activation == "silu", f"{activation} not supported for Marlin MoE." + assert layer.activation == "silu", ( + f"{layer.activation} not supported for Marlin MoE." 
+ ) return fused_marlin_moe( x, layer.w13_weight, @@ -390,9 +374,9 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): topk_weights, topk_ids, quant_type_id=scalar_types.float8_e4m3fn.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, ) else: from vllm.model_executor.layers.fused_moe import fused_experts @@ -404,10 +388,10 @@ class QuarkW8A8Fp8MoEMethod(QuarkMoEMethod): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) @@ -597,23 +581,6 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, @@ -631,9 +598,9 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): layer.w2_weight, topk_weights=topk_weights, topk_ids=topk_ids, - activation=activation, + activation=layer.activation, quant_config=self.moe_quant_config, - expert_map=expert_map, + expert_map=layer.expert_map, ) else: from vllm.model_executor.layers.fused_moe import fused_experts @@ -645,10 +612,11 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod): topk_weights=topk_weights, topk_ids=topk_ids, inplace=True, - activation=activation, - global_num_experts=global_num_experts, - apply_router_weight_on_input=apply_router_weight_on_input, - expert_map=expert_map, + activation=layer.activation, + global_num_experts=layer.global_num_experts, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + expert_map=layer.expert_map, quant_config=self.moe_quant_config, ) + return out diff --git a/vllm/model_executor/layers/quantization/rtn.py b/vllm/model_executor/layers/quantization/rtn.py index 7b51b828009fc..b2ecb0b175f81 100644 --- a/vllm/model_executor/layers/quantization/rtn.py +++ b/vllm/model_executor/layers/quantization/rtn.py @@ -3,7 +3,6 @@ # Copyright © 2025, Oracle and/or its affiliates. 
import os -from collections.abc import Callable from typing import Any, Optional import numpy as np @@ -359,23 +358,6 @@ class RTNMoEMethod(FusedMoEMethodBase): layer: FusedMoE, x: torch.Tensor, router_logits: torch.Tensor, - top_k: int, - renormalize: bool, - use_grouped_topk: bool = False, - topk_group: int | None = None, - num_expert_group: int | None = None, - global_num_experts: int = -1, - expert_map: torch.Tensor | None = None, - custom_routing_function: Callable | None = None, - scoring_func: str = "softmax", - routed_scaling_factor: float = 1.0, - e_score_correction_bias: torch.Tensor | None = None, - apply_router_weight_on_input: bool = False, - activation: str = "silu", - enable_eplb: bool = False, - expert_load_view: torch.Tensor | None = None, - logical_to_physical_map: torch.Tensor | None = None, - logical_replica_count: torch.Tensor | None = None, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: topk_weights, topk_ids, _ = layer.select_experts( hidden_states=x, @@ -394,9 +376,9 @@ class RTNMoEMethod(FusedMoEMethodBase): topk_weights, topk_ids, quant_type_id=self.quant_config.quant_type.id, - apply_router_weight_on_input=apply_router_weight_on_input, - global_num_experts=global_num_experts, - expert_map=expert_map, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, workspace=workspace, ) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index ad92f4ec63c34..e12fe61bf3d97 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -27,6 +27,7 @@ from vllm.model_executor.parameter import ( ChannelQuantScaleParameter, PerTensorScaleParameter, ) +from vllm.model_executor.utils import replace_parameter from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils.deep_gemm import ( @@ -194,6 +195,39 @@ direct_register_custom_op( ) +def _triton_per_token_group_quant_fp8_impl( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + return per_token_group_quant_fp8( + x, group_size, column_major_scales=False, use_ue8m0=False + ) + + +def _triton_per_token_group_quant_fp8_fake( + x: torch.Tensor, + group_size: int, +) -> tuple[torch.Tensor, torch.Tensor]: + M, N = x.shape + x_fp8 = torch.empty((M, N), dtype=current_platform.fp8_dtype(), device=x.device) + out_bs = torch.empty( + ( + M, + (N + group_size - 1) // group_size, + ), + dtype=torch.float32, + device=x.device, + ) + return x_fp8, out_bs + + +direct_register_custom_op( + "triton_per_token_group_quant_fp8", + _triton_per_token_group_quant_fp8_impl, + fake_impl=_triton_per_token_group_quant_fp8_fake, +) + + # TODO fix ROCm->Triton custom path: # https://github.com/vllm-project/vllm/issues/14397 class W8A8BlockFp8LinearOp: @@ -213,6 +247,7 @@ class W8A8BlockFp8LinearOp: self.act_quant_group_shape = act_quant_group_shape self.is_deep_gemm_supported = is_deep_gemm_supported() self.is_hopper = current_platform.is_device_capability(90) + self.is_blackwell = current_platform.is_device_capability(100) self.use_deep_gemm_e8m0 = is_deep_gemm_e8m0_used() # Get the correct blockscale mul and input quant operations. 
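As background for the triton_per_token_group_quant_fp8 registration above, a minimal reference sketch of per-token group FP8 quantization is shown below (plain PyTorch, not vLLM's Triton kernel; it assumes group_size evenly divides the hidden dimension). It reproduces the shape contract encoded in the fake impl: an (M, N) input yields an FP8 tensor of shape (M, N) plus one float32 scale per (token, group).

import torch

def per_token_group_quant_fp8_reference(x: torch.Tensor, group_size: int):
    # x: (M, N) activations; quantize each group of `group_size` columns per token
    M, N = x.shape
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    groups = x.view(M, N // group_size, group_size).float()
    # per-group abs-max scale, clamped to avoid division by zero
    scales = groups.abs().amax(dim=-1).clamp(min=1e-12) / fp8_max
    x_fp8 = (groups / scales.unsqueeze(-1)).to(torch.float8_e4m3fn)
    # output shapes match the registered fake impl: (M, N) and (M, N // group_size)
    return x_fp8.view(M, N), scales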
@@ -268,12 +303,15 @@ class W8A8BlockFp8LinearOp: weight: torch.Tensor, weight_scale: torch.Tensor, ) -> torch.Tensor: - assert self.deepgemm_input_quant_op is not None - q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm( - input_2d, - group_size=self.act_quant_group_shape.col, - use_ue8m0=True, - ) + if self.use_deep_gemm_e8m0 and self.is_blackwell: + q_input, input_scale = per_token_group_quant_fp8_packed_for_deepgemm( + input_2d, + group_size=self.act_quant_group_shape.col, + use_ue8m0=True, + ) + else: + assert self.deepgemm_input_quant_op is not None + q_input, input_scale = self.deepgemm_input_quant_op(input_2d) output = torch.empty( (q_input.shape[0], weight.shape[0]), dtype=torch.bfloat16, @@ -336,17 +374,15 @@ class W8A8BlockFp8LinearOp: if input_scale is not None: q_input = input_2d - # MI350 case uses triton kernel elif use_triton: - q_input, input_scale = per_token_group_quant_fp8( + q_input, input_scale = torch.ops.vllm.triton_per_token_group_quant_fp8( input_2d, self.act_quant_group_shape.col, - column_major_scales=False, - use_ue8m0=False, ) - # MI300 uses tuned AITER ASM/C++ kernel else: - q_input, input_scale = rocm_aiter_ops.group_fp8_quant(input_2d) + q_input, input_scale = rocm_aiter_ops.group_fp8_quant( + input_2d, self.act_quant_group_shape.col + ) return gemm_a8w8_blockscale_op( q_input, @@ -1400,12 +1436,12 @@ def maybe_post_process_fp8_weight_block(layer: torch.nn.Module): if should_use_deepgemm: dg_weight, dg_weight_scale = deepgemm_post_process_fp8_weight_block( wq=layer.weight.data, - ws=layer.weight_scale.data, + ws=layer.weight_scale_inv.data, quant_block_shape=tuple(layer.weight_block_size), use_e8m0=is_deep_gemm_e8m0_used(), ) - layer.weight = torch.nn.Parameter(dg_weight, requires_grad=False) - layer.weight_scale = torch.nn.Parameter(dg_weight_scale, requires_grad=False) + replace_parameter(layer, "weight", dg_weight) + replace_parameter(layer, "weight_scale_inv", dg_weight_scale) def expert_weight_is_col_major(x: torch.Tensor) -> bool: diff --git a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py index e6b4f567caea4..c67e4f437cf0c 100644 --- a/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py +++ b/vllm/model_executor/layers/quantization/utils/marlin_utils_fp8.py @@ -14,6 +14,7 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import ( marlin_quant_input, should_use_atomic_add_reduce, ) +from vllm.model_executor.utils import replace_parameter from vllm.platforms import current_platform from vllm.scalar_type import scalar_types @@ -130,7 +131,7 @@ def prepare_fp8_layer_for_marlin( size_n=part_size_n, num_bits=8, ) - layer.weight = torch.nn.Parameter(marlin_qweight, requires_grad=False) + replace_parameter(layer, "weight", marlin_qweight) # WEIGHT SCALES # Permute scales @@ -138,7 +139,6 @@ def prepare_fp8_layer_for_marlin( scales = layer.weight_scale.to(layer.orig_dtype) elif "weight_scale_inv" in dir(layer): scales = layer.weight_scale_inv.to(layer.orig_dtype) - del layer.weight_scale_inv group_size = -1 if weight_block_size is None else weight_block_size[1] @@ -177,12 +177,15 @@ def prepare_fp8_layer_for_marlin( ) if input_dtype != torch.float8_e4m3fn: marlin_scales = fp8_fused_exponent_bias_into_scales(marlin_scales) - layer.weight_scale = torch.nn.Parameter(marlin_scales, requires_grad=False) + if hasattr(layer, "weight_scale"): + replace_parameter(layer, "weight_scale", marlin_scales) + elif hasattr(layer, 
"weight_scale_inv"): + replace_parameter(layer, "weight_scale_inv", marlin_scales) if hasattr(layer, "bias") and layer.bias is not None: assert layer.bias.shape == (part_size_n,) bias = marlin_permute_bias(layer.bias) - layer.bias = torch.nn.Parameter(bias, requires_grad=False) + replace_parameter(layer, "bias", bias) def prepare_moe_fp8_layer_for_marlin( diff --git a/vllm/model_executor/layers/quantization/utils/quant_utils.py b/vllm/model_executor/layers/quantization/utils/quant_utils.py index 92ee8c498e01f..d01263f82007d 100644 --- a/vllm/model_executor/layers/quantization/utils/quant_utils.py +++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """This file is used for /tests and /benchmarks""" -from collections.abc import Mapping +from collections.abc import Callable, Mapping from dataclasses import dataclass from types import MappingProxyType from typing import ClassVar, NamedTuple @@ -691,3 +691,51 @@ def cutlass_fp4_supported() -> bool: capability_tuple = current_platform.get_device_capability() capability = -1 if capability_tuple is None else capability_tuple.to_int() return cutlass_scaled_mm_supports_fp4(capability) + + +def convert_bf16_scales_to_fp8( + quant_fp8: Callable, scales: torch.Tensor +) -> tuple[torch.Tensor, torch.Tensor]: + """ + Convert a BF16 scale tensor into the pair of (fp8_scales, channel_scales) + expected by W4A8 GEMM kernels. + """ + assert scales.is_contiguous(), ( + f"scale tensor must be contiguous, got {scales.stride()=}" + ) + assert scales.is_cuda, "scales must be on gpu" + + orig_shape = scales.shape + k_groups = orig_shape[-1] + flat_scales = scales.view(-1, k_groups) + + fp8_scales, chan_scales = quant_fp8(flat_scales) + fp8_scales = (fp8_scales.float() / 8.0).to(torch.float8_e4m3fn) + chan_scales *= 8.0 + + # restore original shape + fp8_scales = fp8_scales.view(orig_shape) + chan_scales = chan_scales.view(orig_shape[:-1], -1) + + return fp8_scales, chan_scales + + +def convert_packed_uint4b8_to_signed_int4_inplace(t: torch.Tensor) -> torch.Tensor: + """ + Convert int4b8 (packed to int32) to signed int4 + """ + assert t.is_cuda, "tensor must be on gpu" + assert t.dtype == torch.int32, f"expected int32 packed weights but got {t.dtype}" + + # loop through the 8 4-bit nibbles in each int32 entry + for i in range(8): + shift = 4 * i + # extract the i-th 4-bit nibble + nib = (t >> shift) & 0xF + # clear the original nibble by masking out + t &= ~(0xF << shift) + # convert int4b8 [0..15] to signed int4 [-8..7] by subtracting 8 + # and update in-place + t |= ((nib - 8) & 0xF) << shift + + return t diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index fceed3e55c2df..4287922417c63 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -118,8 +118,11 @@ def requantize_with_max_scale( # from disk in this case. Skip requantization in this case (since) # we already are quantized with the single scale. # * Sample Model: nm-testing/Phi-3-mini-128k-instruct-FP8 + # + # Extra note: upon weight reloading weight_scale.ndim == 0 unfused_module_in_checkpoint = ( - weight_scale[-1] > torch.finfo(torch.float8_e4m3fn).min + weight_scale.ndim != 0 + and weight_scale[-1] > torch.finfo(torch.float8_e4m3fn).min ) # If unfused checkpoint, need requanize with the single scale. 
diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py index 74052f72ceab9..7f94bd234fd38 100644 --- a/vllm/model_executor/model_loader/gguf_loader.py +++ b/vllm/model_executor/model_loader/gguf_loader.py @@ -4,6 +4,7 @@ import os from collections.abc import Generator import gguf +import regex as re import torch import torch.nn as nn from huggingface_hub import hf_hub_download @@ -94,6 +95,7 @@ class GGUFModelLoader(BaseModelLoader): hasattr(config, "vision_config") and config.vision_config is not None ) gguf_to_hf_name_map = {} + sideload_params: list[re.Pattern] = [] # hack: ggufs have a different name than transformers if model_type == "cohere": model_type = "command-r" @@ -118,6 +120,12 @@ class GGUFModelLoader(BaseModelLoader): gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = ( f"model.layers.{idx}.mlp.experts.0.up_proj.weight" ) + sideload_params.append( + re.compile( + f"model\\.layers\\.{idx}" + r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight" + ) + ) if model_type in ("qwen2_moe", "qwen3_moe"): model_type = model_type.replace("_", "") # GGUF layer map assumes that we will have a merged expert weights @@ -132,6 +140,12 @@ class GGUFModelLoader(BaseModelLoader): gguf_to_hf_name_map[f"blk.{idx}.ffn_up_exps.weight"] = ( f"model.layers.{idx}.mlp.experts.0.up_proj.weight" ) + sideload_params.append( + re.compile( + f"model\\.layers\\.{idx}" + r"\.mlp\.experts\.[0-9]+\.(gate|up|down)_proj\.weight" + ) + ) arch = None for key, value in gguf.MODEL_ARCH_NAMES.items(): @@ -241,7 +255,15 @@ class GGUFModelLoader(BaseModelLoader): # Parameter not in manual overrides either unmapped_params.append(hf_name) - # All parameters must be mapped: both vision/projector and backbone + # All parameters (except those initialized by other means) must be mapped: + # both vision/projector and backbone + if unmapped_params: + unmapped_params = list( + filter( + lambda x: not any(re.fullmatch(p, x) for p in sideload_params), + unmapped_params, + ) + ) if unmapped_params: raise RuntimeError( f"Failed to map GGUF parameters " diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 0496b7a84507b..610e6a620ade2 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -641,7 +641,6 @@ def safetensors_weights_iterator( if safetensors_load_strategy == "eager": loading_desc += " (eager)" - state_dict = {} leftover_state_dict: dict[str, torch.Tensor] = {} for st_file in tqdm( @@ -667,6 +666,7 @@ def safetensors_weights_iterator( ) with safe_open(st_file, framework="pt") as f: + state_dict = {} for name in f.keys(): # noqa: SIM118 state_dict[name] = f.get_tensor(name) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 70f203b9f7c64..9ba76f312edac 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -175,9 +175,14 @@ def _create_pooling_model_cls(orig_cls: _T) -> _T: self.vllm_config = vllm_config # These are not used in pooling models - for attr in ("lm_head", "logits_processor"): - if hasattr(self, attr): - delattr(self, attr) + objects_to_clean = [self] + if language_model := getattr(self, "language_model", None): + objects_to_clean.append(language_model) + + for obj in objects_to_clean: + for attr in ("lm_head", "logits_processor"): + if hasattr(obj, attr): + delattr(obj, attr) # If the model already defines a pooler instance, don't 
overwrite it if not getattr(self, "pooler", None): diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 55dd6e50ad249..8de793941b8c3 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -214,7 +214,7 @@ class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig): tokens = getattr(config, "classifier_from_token", None) assert tokens is not None and len(tokens) == 2, ( "Try loading the original Qwen3 Reranker?, see: " - "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/qwen3_reranker.py" + "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/offline_reranker.py" ) vllm_config.model_config.hf_config.method = "from_2_way_softmax" diff --git a/vllm/model_executor/models/glm4_1v.py b/vllm/model_executor/models/glm4_1v.py index 741edfdda3e2c..de091f03e881c 100644 --- a/vllm/model_executor/models/glm4_1v.py +++ b/vllm/model_executor/models/glm4_1v.py @@ -1257,6 +1257,7 @@ class Glm4vDummyInputsBuilder(BaseDummyInputsBuilder[Glm4vProcessingInfo]): ) height = min(height, overrides.height) + num_frames = max(num_frames, 2) # GLM 4.6V requires 2 frames video = np.full((num_frames, width, height, 3), 255, dtype=np.uint8) video_items = [] for i in range(num_videos): diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py index e5c1be626be07..be084f4ee0f8e 100644 --- a/vllm/model_executor/models/hunyuan_vision.py +++ b/vllm/model_executor/models/hunyuan_vision.py @@ -502,6 +502,7 @@ class HunYuanVisionTransformer(nn.Module): cu_seqlens: list = [0] hidden_states = x.to(device=self.device, dtype=self.dtype) + # embeddings = patch_embeds + patch_pos_embed hidden_states = self.embeddings(hidden_states, grid_thw) for t, h, w in grid_thw: @@ -515,8 +516,14 @@ class HunYuanVisionTransformer(nn.Module): hidden_states = hidden_states.reshape(seq_len, -1) hidden_states = hidden_states.unsqueeze(0) - for layer_num, layer in enumerate(self.layers): - hidden_states = layer(hidden_states) + + # build per-image lengths once + split_lengths = [int(h) * int(w) for (_, h, w) in grid_thw] + for layer in self.layers: + # hidden_states: (1, T_total, D) + parts = hidden_states.split(split_lengths, dim=1) # list of (1, L_i, D) + parts = [layer(p) for p in parts] + hidden_states = torch.cat(parts, dim=1) # adapter split_lengths = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 607ff55835f1d..cb99d57e8b8c7 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -53,6 +53,22 @@ The output embeddings must be one of the following formats: """ +def _require_is_multimodal(is_multimodal: Tensor | None) -> Tensor: + """ + A helper function to be used in the context of + [vllm.model_executor.models.interfaces.SupportsMultiModal.embed_input_ids][] + to provide a better error message. + """ + if is_multimodal is None: + raise ValueError( + "`embed_input_ids` now requires `is_multimodal` arg, " + "please update your model runner according to " + "https://github.com/vllm-project/vllm/pull/16229." + ) + + return is_multimodal + + @runtime_checkable class SupportsMultiModal(Protocol): """The interface required for all multi-modal models.""" @@ -111,13 +127,7 @@ class SupportsMultiModal(Protocol): the appearances of their corresponding multimodal data item in the input prompt. 
""" - if hasattr(self, "get_multimodal_embeddings"): - logger.warning_once( - "`get_multimodal_embeddings` for vLLM models is deprecated and will be " - "removed in v0.13.0 or v1.0.0, whichever is earlier. Please rename " - "this method to `embed_multimodal`." - ) - return self.get_multimodal_embeddings(**kwargs) + ... def get_language_model(self) -> VllmModel: """ @@ -196,17 +206,10 @@ class SupportsMultiModal(Protocol): if multimodal_embeddings is None or len(multimodal_embeddings) == 0: return inputs_embeds - if is_multimodal is None: - raise ValueError( - "`embed_input_ids` now requires `is_multimodal` arg, " - "please update your model runner according to " - "https://github.com/vllm-project/vllm/pull/16229." - ) - return _merge_multimodal_embeddings( inputs_embeds=inputs_embeds, multimodal_embeddings=multimodal_embeddings, - is_multimodal=is_multimodal, + is_multimodal=_require_is_multimodal(is_multimodal), ) diff --git a/vllm/model_executor/models/interfaces_base.py b/vllm/model_executor/models/interfaces_base.py index e8d521ec2e8aa..134a1d9483804 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -49,13 +49,7 @@ class VllmModel(Protocol[T_co]): def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: """Apply token embeddings to `input_ids`.""" - if hasattr(self, "get_input_embeddings"): - logger.warning_once( - "`get_input_embeddings` for vLLM models is deprecated and will be " - "removed in v0.13.0 or v1.0.0, whichever is earlier. Please rename " - "this method to `embed_input_ids`." - ) - return self.get_input_embeddings(input_ids) + ... def forward(self, input_ids: torch.Tensor, positions: torch.Tensor) -> T_co: ... @@ -68,15 +62,6 @@ def _check_vllm_model_init(model: type[object] | object) -> bool: def _check_vllm_model_embed_input_ids(model: type[object] | object) -> bool: model_embed_input_ids = getattr(model, "embed_input_ids", None) if not callable(model_embed_input_ids): - model_get_input_embeddings = getattr(model, "get_input_embeddings", None) - if callable(model_get_input_embeddings): - logger.warning( - "`get_input_embeddings` for vLLM models is deprecated and will be " - "removed in v0.13.0 or v1.0.0, whichever is earlier. Please rename " - "this method to `embed_input_ids`." 
- ) - model.embed_input_ids = model_get_input_embeddings - return True logger.warning( "The model (%s) is missing the `embed_input_ids` method.", model, diff --git a/vllm/model_executor/models/minimax_m2.py b/vllm/model_executor/models/minimax_m2.py index dd98e36ec0851..3e6a9add9ec49 100644 --- a/vllm/model_executor/models/minimax_m2.py +++ b/vllm/model_executor/models/minimax_m2.py @@ -201,7 +201,7 @@ class MiniMaxM2Attention(nn.Module): self.rotary_emb = get_rope( self.head_dim, - rotary_dim=rotary_dim, + rotary_dim=self.head_dim, max_position=max_position_embeddings, rope_parameters=rope_parameters, ) diff --git a/vllm/model_executor/models/mistral_large_3_eagle.py b/vllm/model_executor/models/mistral_large_3_eagle.py index e3ca9e4ca82d0..37cd4324e53d9 100644 --- a/vllm/model_executor/models/mistral_large_3_eagle.py +++ b/vllm/model_executor/models/mistral_large_3_eagle.py @@ -18,15 +18,10 @@ from vllm.model_executor.models.deepseek_v2 import ( DeepseekV2DecoderLayer, DeepseekV2Model, ) -from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.mistral_large_3 import MistralLarge3ForCausalLM -from vllm.multimodal.inputs import NestedTensors -from .utils import ( - _merge_multimodal_embeddings, - make_empty_intermediate_tensors_factory, - maybe_prefix, -) +from .interfaces import SupportsMultiModal +from .utils import make_empty_intermediate_tensors_factory, maybe_prefix logger = init_logger(__name__) @@ -117,26 +112,10 @@ class EagleMistralLarge3ForCausalLM(MistralLarge3ForCausalLM): ) super().__init__(vllm_config=vllm_config, prefix=prefix) - def get_input_embeddings( - self, - input_ids: torch.Tensor, - multimodal_embeddings: MultiModalEmbeddings | None = None, - *, - is_multimodal: torch.Tensor | None = None, - handle_oov_mm_token: bool = False, - ) -> torch.Tensor: - inputs_embeds = super().embed_input_ids(input_ids) + def get_language_model(self) -> torch.nn.Module: + return self.model - if multimodal_embeddings is None or len(multimodal_embeddings) == 0: - return inputs_embeds - - assert is_multimodal is not None - - return _merge_multimodal_embeddings( - inputs_embeds=inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - is_multimodal=is_multimodal, - ) + embed_input_ids = SupportsMultiModal.embed_input_ids # type: ignore def forward( self, @@ -155,11 +134,3 @@ class EagleMistralLarge3ForCausalLM(MistralLarge3ForCausalLM): "model.embed_tokens.weight", "lm_head.weight", } - - def embed_input_ids( - self, - input_ids: torch.Tensor, - multimodal_embeddings: NestedTensors | None = None, - is_multimodal: torch.Tensor | None = None, - ) -> torch.Tensor: - return self.model.embed_input_ids(input_ids) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index b7ae548069f25..900b0eade308c 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -64,6 +64,7 @@ from .interfaces import ( SupportsMultiModal, SupportsPP, SupportsQuant, + _require_is_multimodal, ) from .utils import ( AutoWeightsLoader, @@ -687,17 +688,10 @@ class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant) if multimodal_embeddings is None or len(multimodal_embeddings) == 0: return inputs_embeds - if is_multimodal is None: - raise ValueError( - "`embed_input_ids` now requires `is_multimodal` arg, " - "please update your model runner according to " - "https://github.com/vllm-project/vllm/pull/16229." 
- ) - return _merge_multimodal_embeddings( inputs_embeds=inputs_embeds, multimodal_embeddings=multimodal_embeddings, - is_multimodal=is_multimodal, + is_multimodal=_require_is_multimodal(is_multimodal), ) def forward( diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 5a428740082f6..cbc618f1abd08 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -367,6 +367,8 @@ class Qwen2MoeModel(nn.Module): self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.embed_tokens", ) self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, @@ -512,6 +514,12 @@ class Qwen2MoeModel(nn.Module): continue else: name = remapped_kv_scale_name + # GGUF: make sure that shared_expert_gate is a 2D tensor. + if ( + "mlp.shared_expert_gate" in name + and len(loaded_weight.shape) == 1 + ): + loaded_weight = loaded_weight[None, :] param = params_dict[name] weight_loader = getattr( param, "weight_loader", default_weight_loader diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 6f520706a3176..c6984dc37c51c 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -403,6 +403,7 @@ class Qwen3MoeModel(nn.Module): self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.config = config + self.quant_config = quant_config self.embed_tokens = VocabParallelEmbedding( config.vocab_size, config.hidden_size, @@ -505,6 +506,19 @@ class Qwen3MoeModel(nn.Module): loaded_params: set[str] = set() expert_params_mapping = self.get_expert_mapping() for name, loaded_weight in weights: + if self.quant_config is not None and ( + scale_name := self.quant_config.get_cache_scale(name) + ): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + assert loaded_weight.numel() == 1, ( + f"KV scale numel {loaded_weight.numel()} != 1" + ) + loaded_weight = loaded_weight.squeeze() + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue for param_name, weight_name, shard_id in stacked_params_mapping: # Skip non-stacked layers and experts (experts handled below). if weight_name not in name: diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py index 1add39d6b0a84..f8e0ea6284994 100644 --- a/vllm/model_executor/models/qwen3_vl.py +++ b/vllm/model_executor/models/qwen3_vl.py @@ -93,6 +93,7 @@ from .interfaces import ( SupportsMRoPE, SupportsMultiModal, SupportsPP, + _require_is_multimodal, ) from .qwen2_5_vl import ( Qwen2_5_VisionAttention, @@ -1572,12 +1573,7 @@ class Qwen3VLForConditionalGeneration( if multimodal_embeddings is None or len(multimodal_embeddings) == 0: return inputs_embeds - if is_multimodal is None: - raise ValueError( - "`embed_input_ids` now requires `is_multimodal` arg, " - "please update your model runner according to " - "https://github.com/vllm-project/vllm/pull/16229." 
- ) + is_multimodal = _require_is_multimodal(is_multimodal) if self.use_deepstack: ( diff --git a/vllm/model_executor/models/transformers/base.py b/vllm/model_executor/models/transformers/base.py index f3ebc6da8e302..45e746ac2d356 100644 --- a/vllm/model_executor/models/transformers/base.py +++ b/vllm/model_executor/models/transformers/base.py @@ -36,6 +36,8 @@ from vllm.distributed.utils import get_pp_indices from vllm.logger import init_logger from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding from vllm.model_executor.models.interfaces import ( + SupportsEagle, + SupportsEagle3, SupportsLoRA, SupportsPP, SupportsQuant, @@ -92,7 +94,15 @@ def vllm_flash_attention_forward( ALL_ATTENTION_FUNCTIONS["vllm"] = vllm_flash_attention_forward -class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP): +class Base( + nn.Module, + VllmModel, + SupportsQuant, + SupportsLoRA, + SupportsPP, + SupportsEagle, + SupportsEagle3, +): embedding_modules = ["embed_tokens"] # TODO transformers will have a util to get it hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ @@ -131,17 +141,24 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP): self.pp_group = get_pp_group() self.tp_group = get_tp_group() - # Weights to skip in `self.load_weights` + # Attrs for weight loading (see self.load_weights) self.skip_prefixes: list[str] = [] """Skip loading weights whose qualname starts with these prefixes.""" self.skip_substrs: list[str] = [] """Skip loading weights whose qualname contains these substrings.""" self.ignore_unexpected_prefixes: list[str] = [] - """Ignore unexpected weights whose qualname starts with these prefixes. - """ + """Ignore unexpected weights whose qualname starts with these prefixes.""" self.ignore_unexpected_suffixes: list[str] = [] """Ignore unexpected weights whose qualname ends with these suffixes.""" + # Attrs for Eagle3 (see self.set_aux_hidden_state_layers) + self._target_class: type[nn.Module] = nn.Module + """Target class for Eagle3 aux hidden state recording.""" + self._layer_names: dict[int, str] = {} + """Mapping from layer index to layer name for Eagle3.""" + self._output_aux_hidden_states_kwargs: dict[str, bool] = {} + """Kwargs to pass to model forward for Eagle3 aux hidden states.""" + if self.quant_config: quant_method_name = self.quant_config.get_name() # Check for unsupported quantization methods. @@ -278,6 +295,15 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP): for child_name, child_module in module.named_children(): new_module = child_module qual_name = maybe_prefix(prefix, child_name) + # Populate Eagle3 attrs + if ( + isinstance(module, nn.ModuleList) + and len(module) == self.text_config.num_hidden_layers + ): + self._target_class = type(child_module) + layer_name = qual_name.removeprefix("model.") + self._layer_names[int(child_name)] = layer_name + # Replace modules as needed if isinstance(child_module, nn.Linear): generator = (p for p in tp_plan if re.match(p, qual_name)) pattern = next(generator, None) @@ -425,19 +451,26 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP): else: position_ids = positions[None, ...] - hidden_states = self.model( + outputs = self.model( input_ids=input_ids, inputs_embeds=inputs_embeds, use_cache=False, position_ids=position_ids, attention_instances=self.attention_instances, return_dict=False, + **self._output_aux_hidden_states_kwargs, **kwargs, - )[0][0, ...] 
# we remove batch dimension for now + ) + # We must remove the batch dimension from these outputs + hidden_states = outputs[0][0, ...] + if self._output_aux_hidden_states_kwargs: + aux_hidden_states = [x[0][0, ...] for x in outputs[1:]] if not self.pp_group.is_last_rank: return IntermediateTensors({"hidden_states": hidden_states}) + if self._output_aux_hidden_states_kwargs and len(aux_hidden_states) > 0: + return hidden_states, aux_hidden_states return hidden_states def load_weights( @@ -462,3 +495,24 @@ class Base(nn.Module, VllmModel, SupportsQuant, SupportsLoRA, SupportsPP): f"Transformers modeling backend requires transformers>={required} " f"for {feature}, but got {installed}" ) + + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: + self.check_version("5.0.0.dev0", "Eagle3 support") + from transformers.utils.generic import OutputRecorder + + # The default value in PreTrainedModel is None + if self.model._can_record_outputs is None: + self.model._can_record_outputs = {} + + target_class = self._target_class + for layer in layers: + # layer - 1 because we want the input to the layer + layer_name = self._layer_names[layer - 1] + layer_key = f"aux_hidden_state_{layer}" + aux_hidden_state_i = OutputRecorder(target_class, layer_name=layer_name) + self.model._can_record_outputs[layer_key] = aux_hidden_state_i + self._output_aux_hidden_states_kwargs[f"output_{layer_key}"] = True + + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + num_layers = self.text_config.num_hidden_layers + return (2, num_layers // 2, num_layers - 3) diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 8aad59e84ff25..b89371d987541 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -50,6 +50,31 @@ def set_weight_attrs( setattr(weight, key, value) +def replace_parameter(layer: torch.nn.Module, param_name: str, new_data: torch.Tensor): + """ + Replace a parameter of a layer while maintaining the ability to reload the weight. + Called within implementations of the `process_weights_after_loading` method. 
+ + This function should not be called on weights which are tied/shared + + Args: + layer: Layer containing parameter to replace + param_name: Name of parameter to replace + new_data: New data of the new parameter + """ + # should not be used on a tied/shared param + if isinstance(new_data, torch.nn.Parameter): + new_data = new_data.data + new_param = torch.nn.Parameter(new_data, requires_grad=False) + + old_param: torch.nn.Parameter | None = getattr(layer, param_name, None) + if old_param is not None and hasattr(old_param, "weight_loader"): + weight_loader = old_param.weight_loader + set_weight_attrs(new_param, {"weight_loader": weight_loader}) + + setattr(layer, param_name, new_param) + + def get_packed_modules_mapping(model: torch.nn.Module) -> dict[str, list[str]]: parent_map = getattr(model, "packed_modules_mapping", None) parent_map = copy.deepcopy(parent_map) if parent_map is not None else {} diff --git a/vllm/model_executor/warmup/deep_gemm_warmup.py b/vllm/model_executor/warmup/deep_gemm_warmup.py index e0c584df8760b..936f6b1e28ce1 100644 --- a/vllm/model_executor/warmup/deep_gemm_warmup.py +++ b/vllm/model_executor/warmup/deep_gemm_warmup.py @@ -89,7 +89,7 @@ def _extract_data_from_linear_base_module( assert m.quant_method.quant_config is not None w = m.weight - ws = m.weight_scale + ws = m.weight_scale_inv if hasattr(m, "weight_scale_inv") else m.weight_scale quant_block_size = m.quant_method.quant_config.weight_block_size assert isinstance(w, torch.Tensor) diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index 2ed66554e358e..6b1cbbe24e2e7 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -954,7 +954,7 @@ MultiModalKwargsOptionalItems: TypeAlias = ( ) -@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.13.") +@deprecated("`MultiModalKwargs` is deprecated and will be removed in v0.14.") class MultiModalKwargs(UserDict[str, NestedTensors]): """ A dictionary that represents the keyword arguments to @@ -964,7 +964,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): @staticmethod @deprecated( "`MultiModalKwargs.from_hf_inputs` is deprecated and " - "will be removed in v0.13. " + "will be removed in v0.14. " "Please use `MultiModalKwargsItems.from_hf_inputs` and " "access the tensor data using `.get_data()`." ) @@ -977,7 +977,7 @@ class MultiModalKwargs(UserDict[str, NestedTensors]): @staticmethod @deprecated( "`MultiModalKwargs.from_items` is deprecated and " - "will be removed in v0.13. " + "will be removed in v0.14. " "Please use `MultiModalKwargsItems.from_seq` and " "access the tensor data using `.get_data()`." ) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index d4bdc55e569b2..7fd05af583b0a 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -429,12 +429,12 @@ def group_mm_kwargs_by_modality( if merge_by_field_config is not None: logger.warning_once( "The `merge_by_field_config` argument of `group_mm_kwargs_by_modality` " - "is deprecated and will be removed in v0.13." + "is deprecated and will be removed in v0.14." ) if multimodal_cpu_fields is not None: logger.warning_once( "The `multimodal_cpu_fields` argument of `group_mm_kwargs_by_modality` " - "is deprecated and will be removed in v0.13." + "is deprecated and will be removed in v0.14." 
) from vllm.multimodal.inputs import MultiModalKwargsItems diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index abfc226a689c2..024252799cf74 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -283,8 +283,15 @@ class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]): # They can be passed to the underlying # media loaders (e.g. custom implementations) # for flexible control. + + # Allow per-request override of video backend via kwargs. + # This enables users to specify a different backend than the + # global VLLM_VIDEO_LOADER_BACKEND env var, e.g.: + # --media-io-kwargs '{"video": {"video_backend": "torchcodec"}}' + video_loader_backend = ( + kwargs.pop("video_backend", None) or envs.VLLM_VIDEO_LOADER_BACKEND + ) self.kwargs = kwargs - video_loader_backend = envs.VLLM_VIDEO_LOADER_BACKEND self.video_loader = VIDEO_LOADER_REGISTRY.load(video_loader_backend) def load_bytes(self, data: bytes) -> tuple[npt.NDArray, dict[str, Any]]: diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index ff0fc78517876..876114c2d33a4 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -381,6 +381,8 @@ class RocmPlatform(Platform): compilation_config = vllm_config.compilation_config parallel_config = vllm_config.parallel_config is_eager_execution = compilation_config == CUDAGraphMode.NONE + use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled() + use_aiter_fp8_linear = rocm_aiter_ops.is_linear_fp8_enaled() if compilation_config.cudagraph_mode.has_full_cudagraphs(): # decode context parallel does not support full cudagraphs @@ -400,10 +402,22 @@ class RocmPlatform(Platform): ) compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE - use_aiter_rms_norm = rocm_aiter_ops.is_rmsnorm_enabled() - if cache_config and cache_config.block_size is None: - cache_config.block_size = 16 + if ( + envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION and envs.VLLM_ROCM_USE_AITER + # NOTE: This block has been deprecated + # or get_env_variable_attn_backend() + # == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN + # TODO: monitor https://github.com/vllm-project/vllm/pull/30396 + # to see how we can transition to the new way of selecting + # attention backends + ): + cache_config.block_size = 64 + logger.warning( + "[ROCM_AITER_UNIFIED_ATTN]: Setting kv cache block size to 64." 
+ ) + else: + cache_config.block_size = 16 if parallel_config.worker_cls == "auto": parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker" @@ -415,6 +429,9 @@ class RocmPlatform(Platform): ): compilation_config.custom_ops.append("+rms_norm") + if use_aiter_fp8_linear and "-quant_fp8" not in compilation_config.custom_ops: + compilation_config.custom_ops.append("+quant_fp8") + @classmethod def verify_model_arch(cls, model_arch: str) -> None: if model_arch in _ROCM_UNSUPPORTED_MODELS: diff --git a/vllm/profiler/gpu_profiler.py b/vllm/profiler/wrapper.py similarity index 73% rename from vllm/profiler/gpu_profiler.py rename to vllm/profiler/wrapper.py index 798c615221b9f..a44a6a5eea0dd 100644 --- a/vllm/profiler/gpu_profiler.py +++ b/vllm/profiler/wrapper.py @@ -3,26 +3,27 @@ from abc import ABC, abstractmethod from contextlib import nullcontext +from typing import Literal import torch from typing_extensions import override -import vllm.envs as envs +from vllm.config import ProfilerConfig from vllm.logger import init_logger logger = init_logger(__name__) class WorkerProfiler(ABC): - def __init__(self) -> None: - self._delay_iters = envs.VLLM_PROFILER_DELAY_ITERS + def __init__(self, profiler_config: ProfilerConfig) -> None: + self._delay_iters = profiler_config.delay_iterations if self._delay_iters > 0: logger.info_once( "GPU profiling will start " f"{self._delay_iters} steps after start_profile." ) - self._max_iters = envs.VLLM_PROFILER_MAX_ITERS + self._max_iters = profiler_config.max_iterations if self._max_iters > 0: logger.info_once( "GPU profiling will stop " @@ -133,12 +134,27 @@ class WorkerProfiler(ABC): return nullcontext() +TorchProfilerActivity = Literal["CPU", "CUDA", "XPU"] +TorchProfilerActivityMap = { + "CPU": torch.profiler.ProfilerActivity.CPU, + "CUDA": torch.profiler.ProfilerActivity.CUDA, + "XPU": torch.profiler.ProfilerActivity.XPU, +} + + class TorchProfilerWrapper(WorkerProfiler): - def __init__(self, worker_name: str, local_rank: int) -> None: - super().__init__() + def __init__( + self, + profiler_config: ProfilerConfig, + worker_name: str, + local_rank: int, + activities: list[TorchProfilerActivity], + ) -> None: + super().__init__(profiler_config) self.local_rank = local_rank - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + self.profiler_config = profiler_config + torch_profiler_trace_dir = profiler_config.torch_profiler_dir if local_rank in (None, 0): logger.info( "Torch profiling enabled. 
Traces will be saved to: %s", @@ -147,24 +163,23 @@ class TorchProfilerWrapper(WorkerProfiler): logger.debug( "Profiler config: record_shapes=%s," "profile_memory=%s,with_stack=%s,with_flops=%s", - envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - envs.VLLM_TORCH_PROFILER_WITH_STACK, - envs.VLLM_TORCH_PROFILER_WITH_FLOPS, + profiler_config.torch_profiler_record_shapes, + profiler_config.torch_profiler_with_memory, + profiler_config.torch_profiler_with_stack, + profiler_config.torch_profiler_with_flops, ) + + self.dump_cpu_time_total = "CPU" in activities and len(activities) == 1 self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.CUDA, - ], - record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, - with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, + activities=[TorchProfilerActivityMap[activity] for activity in activities], + record_shapes=profiler_config.torch_profiler_record_shapes, + profile_memory=profiler_config.torch_profiler_with_memory, + with_stack=profiler_config.torch_profiler_with_stack, + with_flops=profiler_config.torch_profiler_with_flops, on_trace_ready=torch.profiler.tensorboard_trace_handler( torch_profiler_trace_dir, worker_name=worker_name, - use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP, + use_gzip=profiler_config.torch_profiler_use_gzip, ), ) @@ -176,9 +191,10 @@ class TorchProfilerWrapper(WorkerProfiler): def _stop(self) -> None: self.profiler.stop() - if envs.VLLM_TORCH_PROFILER_DUMP_CUDA_TIME_TOTAL: - rank = self.local_rank - profiler_dir = envs.VLLM_TORCH_PROFILER_DIR + profiler_config = self.profiler_config + rank = self.local_rank + if profiler_config.torch_profiler_dump_cuda_time_total: + profiler_dir = profiler_config.torch_profiler_dir profiler_out_file = f"{profiler_dir}/profiler_out_{rank}.txt" sort_key = "self_cuda_time_total" table = self.profiler.key_averages().table(sort_by=sort_key) @@ -189,6 +205,12 @@ class TorchProfilerWrapper(WorkerProfiler): # only print profiler results on rank 0 if rank == 0: print(table) + if self.dump_cpu_time_total and rank == 0: + logger.info( + self.profiler.key_averages().table( + sort_by="self_cpu_time_total", row_limit=50 + ) + ) @override def annotate_context_manager(self, name: str): @@ -196,8 +218,8 @@ class TorchProfilerWrapper(WorkerProfiler): class CudaProfilerWrapper(WorkerProfiler): - def __init__(self) -> None: - super().__init__() + def __init__(self, profiler_config: ProfilerConfig) -> None: + super().__init__(profiler_config) # Note: lazy import to avoid dependency issues if CUDA is not available. import torch.cuda.profiler as cuda_profiler diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py index d0661d1f23b06..bf593ca4e52a0 100644 --- a/vllm/reasoning/abs_reasoning_parsers.py +++ b/vllm/reasoning/abs_reasoning_parsers.py @@ -63,6 +63,31 @@ class ReasoningParser: True if the reasoning content ends in the input_ids. """ + def is_reasoning_end_streaming( + self, input_ids: list[int], delta_ids: list[int] + ) -> bool: + """ + Check if the reasoning content ends in the input_ids on a + decode step. + + It is used in structured engines like `xgrammar` to check if the + reasoning content ends in the model output during a decode step. 
+ `input_ids` the entire model output and `delta_ids` are the last few + computed tokens of the model output (like during a decode step). + + Parameters: + input_ids: list[int] + The entire model output. + delta_ids: list[int] + The last few computed tokens of the model output at the current decode step. + + Returns: + bool + True if the reasoning content ends in the `delta_ids` on a + decode step. + """ + return self.is_reasoning_end(input_ids) + @abstractmethod def extract_content_ids(self, input_ids: list[int]) -> list[int]: """ diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py index e78ac4a5ebb37..43067ca4afe05 100644 --- a/vllm/reasoning/basic_parsers.py +++ b/vllm/reasoning/basic_parsers.py @@ -74,6 +74,12 @@ class BaseThinkingReasoningParser(ReasoningParser): return True return False + def is_reasoning_end_streaming( + self, input_ids: list[int], delta_ids: list[int] + ) -> bool: + end_token_id = self.end_token_id + return end_token_id in delta_ids + def extract_content_ids(self, input_ids: list[int]) -> list[int]: """ Extract the content after the end tokens diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py index afdf73262aca0..6604f70badbcf 100644 --- a/vllm/reasoning/deepseek_v3_reasoning_parser.py +++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py @@ -35,6 +35,11 @@ class DeepSeekV3ReasoningParser(ReasoningParser): def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: return self._parser.is_reasoning_end(input_ids) + def is_reasoning_end_streaming( + self, input_ids: list[int], delta_ids: list[int] + ) -> bool: + return self._parser.is_reasoning_end_streaming(input_ids, delta_ids) + def extract_content_ids(self, input_ids: list[int]) -> list[int]: return self._parser.extract_content_ids(input_ids) diff --git a/vllm/reasoning/holo2_reasoning_parser.py b/vllm/reasoning/holo2_reasoning_parser.py index 76de1c077c88b..f80190d28d6aa 100644 --- a/vllm/reasoning/holo2_reasoning_parser.py +++ b/vllm/reasoning/holo2_reasoning_parser.py @@ -56,6 +56,11 @@ class Holo2ReasoningParser(ReasoningParser): def is_reasoning_end(self, input_ids: Sequence[int]) -> bool: return self._parser.is_reasoning_end(input_ids) + def is_reasoning_end_streaming( + self, input_ids: list[int], delta_ids: list[int] + ) -> bool: + return self._parser.is_reasoning_end_streaming(input_ids, delta_ids) + def extract_content_ids(self, input_ids: list[int]) -> list[int]: return self._parser.extract_content_ids(input_ids) diff --git a/vllm/reasoning/identity_reasoning_parser.py b/vllm/reasoning/identity_reasoning_parser.py index e92f8add0391a..e998e071efcf6 100644 --- a/vllm/reasoning/identity_reasoning_parser.py +++ b/vllm/reasoning/identity_reasoning_parser.py @@ -32,6 +32,11 @@ class IdentityReasoningParser(ReasoningParser): # Always return True, since we never treat reasoning specially return True + def is_reasoning_end_streaming( + self, input_ids: list[int], delta_ids: list[int] + ) -> bool: + return True + def extract_content_ids(self, input_ids: list[int]) -> list[int]: # Identity: return all tokens as content return input_ids diff --git a/vllm/reasoning/minimax_m2_reasoning_parser.py b/vllm/reasoning/minimax_m2_reasoning_parser.py index 138d1b4e6dacf..a2b9224cb3bff 100644 --- a/vllm/reasoning/minimax_m2_reasoning_parser.py +++ b/vllm/reasoning/minimax_m2_reasoning_parser.py @@ -19,6 +19,10 @@ logger = init_logger(__name__) class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser): """ Reasoning parser for MiniMax 
M2 model. + + MiniMax M2 models don't generate a start token, only an end + token. All content before it is reasoning; content after it is the + actual response. """ @property @@ -31,6 +35,45 @@ class MiniMaxM2ReasoningParser(BaseThinkingReasoningParser): """The token that ends reasoning content.""" return "" + def extract_reasoning_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> DeltaMessage | None: + """ + Extract reasoning content from a delta message for streaming. + + MiniMax M2 models don't generate a start token, so we assume + all content is reasoning until we encounter the end token. + """ + # Skip single end token + if len(delta_token_ids) == 1 and delta_token_ids[0] == self.end_token_id: + return None + + # Check if end token has already appeared in previous tokens, + # meaning we're past the reasoning phase + if self.end_token_id in previous_token_ids: + # We're past the reasoning phase, this is content + return DeltaMessage(content=delta_text) + + # Check if end token is in delta tokens + if self.end_token_id in delta_token_ids: + # End token in delta, split reasoning and content + end_index = delta_text.find(self.end_token) + reasoning = delta_text[:end_index] + content = delta_text[end_index + len(self.end_token) :] + return DeltaMessage( + reasoning=reasoning if reasoning else None, + content=content if content else None, + ) + + # No end token yet, all content is reasoning + return DeltaMessage(reasoning=delta_text) + class MiniMaxM2AppendThinkReasoningParser(ReasoningParser): """ diff --git a/vllm/reasoning/mistral_reasoning_parser.py b/vllm/reasoning/mistral_reasoning_parser.py index b61e50c188f8c..3206dbb29fe2e 100644 --- a/vllm/reasoning/mistral_reasoning_parser.py +++ b/vllm/reasoning/mistral_reasoning_parser.py @@ -3,20 +3,29 @@ from functools import cached_property +from vllm.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ResponsesRequest, +) from vllm.logger import init_logger from vllm.reasoning import ReasoningParser -from vllm.reasoning.deepseek_r1_reasoning_parser import DeepSeekR1ReasoningParser +from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser from vllm.tokenizers import MistralTokenizer logger = init_logger(__name__) -class MistralReasoningParser(DeepSeekR1ReasoningParser): +class MistralReasoningParser(BaseThinkingReasoningParser): """ Reasoning parser for Mistral models. - The Mistral models uses [THINK]...[/THINK] tokens to denote reasoning + The Mistral models use `[THINK]`...`[/THINK]` tokens to denote reasoning text. This parser extracts the reasoning content from the model output. + + A valid reasoning trace should always start with a `[THINK]` token and end with + a `[/THINK]` token. + + If the `[THINK]` token is not generated, this parser only returns content. """ def __init__(self, tokenizer: MistralTokenizer, *args, **kwargs): @@ -53,3 +62,93 @@ class MistralReasoningParser(DeepSeekR1ReasoningParser): from mistral_common.tokens.tokenizers.base import SpecialTokens return SpecialTokens.end_think + + def is_reasoning_end(self, input_ids: list[int]) -> bool: + has_eot_token = False + + for id in input_ids[::-1]: + if id == self.start_token_id: + # Reasoning ends only if a BOT token is found before an EOT token.
+ return has_eot_token + elif id == self.end_token_id: + has_eot_token = True + return False + + def extract_content_ids(self, input_ids: list[int]) -> list[int]: + """ + Extract the content + """ + has_bot_token = False + has_eot_token = False + bot_token_index = -1 + eot_token_index = -1 + # One for loop instead of multiple lookups + for i, token_id in enumerate(input_ids): + # Only the first BOT token counts; repeated BOT tokens should not + # happen for a well-prompted, trained model + if token_id == self.start_token_id and not has_bot_token: + has_bot_token = True + bot_token_index = i + elif token_id == self.end_token_id: + has_eot_token = True + eot_token_index = i + break + + # 1. Only BOT has been outputted + if has_bot_token and not has_eot_token: + # Should be [] if the model is well prompted and trained. + return input_ids[:bot_token_index] + # 2. Neither BOT nor EOT has been outputted + elif not has_bot_token and not has_eot_token: + return input_ids + # 3. Both BOT and EOT have been outputted. + elif has_bot_token and has_eot_token: + return input_ids[:bot_token_index] + input_ids[eot_token_index + 1 :] + # 4. Only EOT has been outputted => this should not have occurred for a model + # well prompted and trained. + else: + return input_ids[:eot_token_index] + input_ids[eot_token_index + 1 :] + + def extract_reasoning( + self, model_output: str, request: ChatCompletionRequest | ResponsesRequest + ) -> tuple[str | None, str | None]: + """ + Extract reasoning content from the model output. + """ + if not model_output: + return (None, "") + + # Check if the start token is present in the model output, remove it + # if it is present. + prev_bot_token, bot_token, post_bot_token = model_output.partition( + self.start_token + ) + + has_bot_token = bool(bot_token) + # A valid EOT token should follow the BOT token + has_valid_eot_token = has_bot_token and self.end_token in post_bot_token + + # 1. If there is a BOT token followed by an EOT token + if has_bot_token and has_valid_eot_token: + prev_eot_token, _, post_eot_token = post_bot_token.partition(self.end_token) + # If the model is well prompted and trained, prev_bot_token should be "" + content = prev_bot_token + post_eot_token + return prev_eot_token, content if content else None + # 2. Only BOT token + elif has_bot_token: + # If the model is well prompted and trained, prev_bot_token should be "" + return post_bot_token, prev_bot_token if prev_bot_token else None + # 3.
EOT token has been outputted without BOT or neither has been outputted + else: + has_non_valid_eot_token = self.end_token in prev_bot_token + # 3.a EOT token has been outputted without BOT + # If model is well prompted and trained `has_non_valid_eot_token` should + # be `False` and the parser outputs all tokens as 'content' + if has_non_valid_eot_token: + prev_eot_token, _, post_eot_token = prev_bot_token.partition( + self.end_token + ) + return None, prev_eot_token + post_eot_token + # 3.b neither BOT or EOT have been outputted + else: + return None, prev_bot_token diff --git a/vllm/tokenizers/deepseekv32.py b/vllm/tokenizers/deepseekv32.py index b0490dacbe2d4..5c4936b5e7ad3 100644 --- a/vllm/tokenizers/deepseekv32.py +++ b/vllm/tokenizers/deepseekv32.py @@ -17,6 +17,8 @@ class DeepseekV32Tokenizer(HfTokenizer): self.name_or_path = ( tokenizer.name_or_path if hasattr(tokenizer, "name_or_path") else "" ) + self._added_vocab = self.tokenizer.get_added_vocab() + self._added_vocab_size = len(self._added_vocab) @classmethod def from_pretrained( @@ -98,7 +100,7 @@ class DeepseekV32Tokenizer(HfTokenizer): def __len__(self) -> int: # is an added token in DeepseekV32 tokenizer - return self.vocab_size + len(self.get_added_vocab()) + return self.vocab_size + self._added_vocab_size def __call__( self, @@ -120,7 +122,7 @@ class DeepseekV32Tokenizer(HfTokenizer): return self.tokenizer.get_vocab() def get_added_vocab(self) -> dict[str, int]: - return self.tokenizer.get_added_vocab() + return self._added_vocab.copy() def encode( self, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 773fc05a52ef3..d761802da9403 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -954,6 +954,13 @@ def try_get_generation_config( revision: str | None = None, config_format: str | ConfigFormat = "auto", ) -> GenerationConfig | None: + # GGUF files don't have generation_config.json - their config is embedded + # in the file header. Skip all filesystem lookups to avoid re-reading the + # memory-mapped file, which can hang in multi-process scenarios when the + # EngineCore process already has the file mapped. + if is_gguf(model): + return None + try: return GenerationConfig.from_pretrained( model, diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py index 32999903b3480..8745e1d9dbbbc 100644 --- a/vllm/transformers_utils/tokenizer.py +++ b/vllm/transformers_utils/tokenizer.py @@ -17,7 +17,7 @@ def __getattr__(name: str): warnings.warn( "`vllm.transformers_utils.tokenizer.AnyTokenizer` has been moved to " "`vllm.tokenizers.TokenizerLike`. " - "The old name will be removed in v0.13.", + "The old name will be removed in v0.14.", DeprecationWarning, stacklevel=2, ) @@ -29,7 +29,7 @@ def __getattr__(name: str): warnings.warn( "`vllm.transformers_utils.tokenizer.get_tokenizer` " "has been moved to `vllm.tokenizers.get_tokenizer`. " - "The old name will be removed in v0.13.", + "The old name will be removed in v0.14.", DeprecationWarning, stacklevel=2, ) @@ -41,7 +41,7 @@ def __getattr__(name: str): warnings.warn( "`vllm.transformers_utils.tokenizer.cached_get_tokenizer` " "has been moved to `vllm.tokenizers.cached_get_tokenizer`. 
" - "The old name will be removed in v0.13.", + "The old name will be removed in v0.14.", DeprecationWarning, stacklevel=2, ) @@ -53,7 +53,7 @@ def __getattr__(name: str): warnings.warn( "`vllm.transformers_utils.tokenizer.cached_tokenizer_from_config` " "has been moved to `vllm.tokenizers.cached_tokenizer_from_config`. " - "The old name will be removed in v0.13.", + "The old name will be removed in v0.14.", DeprecationWarning, stacklevel=2, ) @@ -65,7 +65,7 @@ def __getattr__(name: str): warnings.warn( "`vllm.transformers_utils.tokenizer.init_tokenizer_from_configs` " "has been moved to `vllm.tokenizers.init_tokenizer_from_config`. " - "The old name will be removed in v0.13.", + "The old name will be removed in v0.14.", DeprecationWarning, stacklevel=2, ) @@ -75,7 +75,7 @@ def __getattr__(name: str): raise AttributeError(f"module {__name__!r} has no attribute {name!r}") -@deprecated("Will be removed in v0.13. Please use `tokenizer.decode()` instead.") +@deprecated("Will be removed in v0.14. Please use `tokenizer.decode()` instead.") def decode_tokens( tokenizer: TokenizerLike, token_ids: list[int], @@ -97,7 +97,7 @@ def decode_tokens( return tokenizer.decode(token_ids, **kw_args) -@deprecated("Will be removed in v0.13. Please use `tokenizer.encode()` instead.") +@deprecated("Will be removed in v0.14. Please use `tokenizer.encode()` instead.") def encode_tokens( tokenizer: TokenizerLike, text: str, diff --git a/vllm/transformers_utils/tokenizer_base.py b/vllm/transformers_utils/tokenizer_base.py index 78fb6edc8b9ed..3dfd4b4f2f6c1 100644 --- a/vllm/transformers_utils/tokenizer_base.py +++ b/vllm/transformers_utils/tokenizer_base.py @@ -11,7 +11,7 @@ def __getattr__(name: str): warnings.warn( "`vllm.transformers_utils.tokenizer_base.TokenizerBase` has been " "moved to `vllm.tokenizers.TokenizerLike`. " - "The old name will be removed in v0.13.", + "The old name will be removed in v0.14.", DeprecationWarning, stacklevel=2, ) @@ -23,7 +23,7 @@ def __getattr__(name: str): warnings.warn( "`vllm.transformers_utils.tokenizer_base.TokenizerRegistry` has been " "moved to `vllm.tokenizers.TokenizerRegistry`. " - "The old name will be removed in v0.13.", + "The old name will be removed in v0.14.", DeprecationWarning, stacklevel=2, ) diff --git a/vllm/utils/deep_gemm.py b/vllm/utils/deep_gemm.py index 8545108a02666..a099fde1bdc45 100644 --- a/vllm/utils/deep_gemm.py +++ b/vllm/utils/deep_gemm.py @@ -399,6 +399,7 @@ def should_use_deepgemm_for_fp8_linear_for_nk( __all__ = [ "calc_diff", + "DeepGemmQuantScaleFMT", "fp8_gemm_nt", "m_grouped_fp8_gemm_nt_contiguous", "fp8_m_grouped_gemm_nt_masked", diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py index 7aaf690cbaa13..9a66049350cd8 100644 --- a/vllm/utils/flashinfer.py +++ b/vllm/utils/flashinfer.py @@ -269,6 +269,8 @@ def supports_trtllm_attention() -> bool: def force_use_trtllm_attention() -> bool | None: """ + This function should only be called during initialization stage when vllm config + is set. Return `None` if --attention-config.use_trtllm_attention is not set, return `True` if TRTLLM attention is forced to be used, return `False` if TRTLLM attention is forced to be not used. 
@@ -296,11 +298,12 @@ def use_trtllm_attention( kv_cache_dtype: str, q_dtype: torch.dtype, is_prefill: bool, + # None means auto-detection, True means force on, False means force off + force_use_trtllm: bool | None = None, has_sinks: bool = False, has_spec: bool = False, ) -> bool: """Return `True` if TRTLLM attention is used.""" - force_use_trtllm = force_use_trtllm_attention() # CLI argument is set to 0 - respect it if force_use_trtllm is not None and not force_use_trtllm: diff --git a/vllm/v1/attention/backends/cpu_attn.py b/vllm/v1/attention/backends/cpu_attn.py index fed7dcdf293bd..394d0c2f67136 100644 --- a/vllm/v1/attention/backends/cpu_attn.py +++ b/vllm/v1/attention/backends/cpu_attn.py @@ -21,7 +21,7 @@ from vllm.v1.attention.backends.utils import ( CommonAttentionMetadata, split_decodes_and_prefills, ) -from vllm.v1.kv_cache_interface import AttentionSpec +from vllm.v1.kv_cache_interface import AttentionSpec, CrossAttentionSpec logger = init_logger(__name__) @@ -50,11 +50,13 @@ class CPUAttentionBackend(AttentionBackend): @classmethod def supports_attn_type(cls, attn_type: str) -> bool: - """CPU attention supports decoder and encoder-only attention.""" + """CPU attention supports decoder, + encoder-only and encoder-decoder attention.""" return attn_type in ( AttentionType.DECODER, AttentionType.ENCODER, AttentionType.ENCODER_ONLY, + AttentionType.ENCODER_DECODER, ) @staticmethod @@ -136,6 +138,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata] self.window_size = -1 self.block_size = vllm_config.cache_config.block_size self.isa = _get_attn_isa(self.dtype, self.block_size) + self.is_cross_attention = isinstance(kv_cache_spec, CrossAttentionSpec) def build( self, @@ -151,7 +154,7 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata] seq_lens = common_attn_metadata.seq_lens block_table_tensor = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping - causal = common_attn_metadata.causal + causal = False if self.is_cross_attention else common_attn_metadata.causal sdpa_start_loc = query_start_loc num_decode_tokens = 0 @@ -171,22 +174,19 @@ class CPUAttentionMetadataBuilder(AttentionMetadataBuilder[CPUAttentionMetadata] query_start_loc = query_start_loc[: num_decodes + 1] block_table_tensor = block_table_tensor[:num_decodes] - sheduler_metadata = None - if causal: - # for decode batch, use the custom kernel - sheduler_metadata = ops.cpu_attn_get_scheduler_metadata( - num_reqs=num_reqs, - num_heads=self.num_heads, - num_kv_heads=self.num_kv_heads, - head_dim=self.head_dim, - seq_lens=seq_lens, - dtype=self.dtype, - query_start_loc=query_start_loc, - causal=causal, - sliding_window_size=self.window_size, - isa=self.isa, - enable_kv_split=True, - ) + sheduler_metadata = ops.cpu_attn_get_scheduler_metadata( + num_reqs=num_reqs, + num_heads=self.num_heads, + num_kv_heads=self.num_kv_heads, + head_dim=self.head_dim, + seq_lens=seq_lens, + dtype=self.dtype, + query_start_loc=query_start_loc, + causal=causal, + sliding_window_size=self.window_size, + isa=self.isa, + enable_kv_split=True, + ) attn_metadata = CPUAttentionMetadata( isa=self.isa, diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 8e9d764e4a123..4174b80ee312e 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -429,6 +429,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): 
super().__init__(kv_cache_spec, layer_names, vllm_config, device) self.cache_config = vllm_config.cache_config self.model_config = vllm_config.model_config + self.attention_config = vllm_config.attention_config self._workspace_buffer = None self._prefill_wrapper: ( BatchPrefillWithPagedKVCacheWrapper | BatchDCPPrefillWrapper | None @@ -779,6 +780,7 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]): self.cache_dtype, self.q_data_type, is_prefill=True, + force_use_trtllm=self.attention_config.use_trtllm_attention, has_sinks=self.has_sinks, has_spec=uses_spec_reorder, ) diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py index e921f8c3de073..3a2f92d9921c3 100644 --- a/vllm/v1/attention/backends/gdn_attn.py +++ b/vllm/v1/attention/backends/gdn_attn.py @@ -370,6 +370,6 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata] num_accepted_tokens = torch.diff(m.query_start_loc) num_decode_draft_tokens_cpu = (num_accepted_tokens - 1).cpu() - m.num_computed_tokens_cpu = m.seq_lens_cpu - num_accepted_tokens.cpu() + m._num_computed_tokens_cpu = m.seq_lens_cpu - num_accepted_tokens.cpu() return self.build(0, m, num_accepted_tokens, num_decode_draft_tokens_cpu) diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index 309ddee4fc2f0..8265503c28c35 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -1654,6 +1654,33 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): # Convert from (L, N, P) to (N, P, L) self.W_UK_T = W_UK.permute(1, 2, 0) + def _concat_k_nope_k_pe( + self, k_nope: torch.Tensor, k_pe: torch.Tensor + ) -> torch.Tensor: + """ + Efficiently concatenate k_nope and k_pe tensors along the last dimension. + + This function avoids the performance penalty of torch.cat with expanded + non-contiguous tensors by pre-allocating the output and using direct copies. 
+ + Args: + k_nope: Tensor of shape [..., nope_dim] + k_pe: Tensor to broadcast and concatenate, typically shape [..., 1, pe_dim] + or [..., pe_dim] + + Returns: + Tensor of shape [..., nope_dim + pe_dim] + """ + k = torch.empty( + (*k_nope.shape[:-1], k_nope.shape[-1] + k_pe.shape[-1]), + dtype=k_nope.dtype, + device=k_nope.device, + ) + # Direct copies with efficient broadcasting + k[..., : k_nope.shape[-1]] = k_nope + k[..., k_nope.shape[-1] :] = k_pe + return k + def _compute_prefill_context( self, q: torch.Tensor, @@ -1690,7 +1717,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): ) k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) - k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1) + k = self._concat_k_nope_k_pe(k_nope, k_pe) attn_output, attn_softmax_lse = self._run_prefill_context_chunk( prefill=prefill_metadata, @@ -1794,7 +1821,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): -1, self.num_heads, self.qk_nope_head_dim + self.v_head_dim ) k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) - k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1) + k = self._concat_k_nope_k_pe(k_nope, k_pe) attn_output, attn_softmax_lse = self._run_prefill_context_chunk( prefill=prefill_metadata, @@ -1843,7 +1870,7 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): ) k_nope, v = kv_nope.split([self.qk_nope_head_dim, self.v_head_dim], dim=-1) - k = torch.cat((k_nope, k_pe.expand((*k_nope.shape[:-1], -1))), dim=-1) + k = self._concat_k_nope_k_pe(k_nope, k_pe) output_prefill = self._run_prefill_new_tokens( prefill=attn_metadata.prefill, @@ -2037,21 +2064,30 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]): if fp8_attention: ql_nope_shape = decode_ql_nope.shape - decode_ql_nope, _ = ops.scaled_fp8_quant( - decode_ql_nope.reshape( - [ql_nope_shape[0], ql_nope_shape[1] * ql_nope_shape[2]] - ), - layer._q_scale, - ) - decode_ql_nope = decode_ql_nope.reshape(ql_nope_shape) q_pe_shape = decode_q_pe.shape - decode_q_pe, _ = ops.scaled_fp8_quant( - decode_q_pe.reshape([q_pe_shape[0], q_pe_shape[1] * q_pe_shape[2]]), + assert decode_ql_nope.shape[0] == decode_q_pe.shape[0] + assert decode_ql_nope.shape[1] == decode_q_pe.shape[1] + decode_q_shape = ( + ql_nope_shape[0], + ql_nope_shape[1], + ql_nope_shape[2] + q_pe_shape[2], + ) + # Using empty and copy since torch.cat introduces significant overhead. + decode_q0 = torch.empty( + decode_q_shape, + device=decode_ql_nope.device, + dtype=decode_ql_nope.dtype, + ) + decode_q0[..., : ql_nope_shape[2]].copy_(decode_ql_nope) + decode_q0[..., ql_nope_shape[2] :].copy_(decode_q_pe) + + decode_q, _ = ops.scaled_fp8_quant( + decode_q0.view(decode_q_shape[0], -1), layer._q_scale, ) - decode_q_pe = decode_q_pe.reshape(q_pe_shape) - - decode_q = (decode_ql_nope, decode_q_pe) + decode_q = decode_q.view(decode_q_shape) + else: + decode_q = (decode_ql_nope, decode_q_pe) if self.dcp_world_size > 1: assert not fp8_attention, "DCP not support fp8 kvcache now." 
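# Illustrative aside, not part of the patch: the empty-and-copy concatenation
# used by _concat_k_nope_k_pe and the decode_q0 path above, shown standalone
# with made-up shapes. Writing slices into a pre-allocated buffer avoids
# torch.cat over an expanded, non-contiguous k_pe while producing the same
# result.
import torch

nope_dim, pe_dim = 128, 64
k_nope = torch.randn(8, 16, nope_dim)  # [tokens, heads, nope_dim]
k_pe = torch.randn(8, 1, pe_dim)       # broadcasts across the head dim
k = torch.empty((*k_nope.shape[:-1], nope_dim + pe_dim), dtype=k_nope.dtype)
k[..., :nope_dim] = k_nope             # direct slice copy
k[..., nope_dim:] = k_pe               # broadcasting copy, no torch.cat
assert torch.equal(k, torch.cat((k_nope, k_pe.expand(8, 16, pe_dim)), dim=-1))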
# concatenate decode_ql_nope and decode_q_pe -> (B, N, L + P) diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py index eccf4ec791095..b28814aceada9 100644 --- a/vllm/v1/attention/backends/mla/flashattn_mla.py +++ b/vllm/v1/attention/backends/mla/flashattn_mla.py @@ -105,13 +105,14 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata] vllm_config: VllmConfig, device: torch.device, ): + interleave_size = vllm_config.parallel_config.cp_kv_cache_interleave_size super().__init__( kv_cache_spec, layer_names, vllm_config, device, FlashAttnMLAMetadata, - supports_dcp_with_varlen=True, + supports_dcp_with_varlen=(interleave_size == 1), ) self.max_num_splits = 0 # No upper bound on the number of splits. self.fa_aot_schedule = get_flash_attn_version() == 3 diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py index 8edfbb5140bc9..79a1f7d4757d9 100644 --- a/vllm/v1/attention/backends/utils.py +++ b/vllm/v1/attention/backends/utils.py @@ -18,7 +18,7 @@ from typing import ( import numpy as np import torch -from typing_extensions import runtime_checkable +from typing_extensions import deprecated, runtime_checkable from vllm.config import VllmConfig, get_layers_from_vllm_config from vllm.utils.math_utils import cdiv @@ -66,11 +66,6 @@ class CommonAttentionMetadata: """(batch_size + 1,), the start location of each request in query Tensor""" seq_lens: torch.Tensor - seq_lens_cpu: torch.Tensor - """(batch_size,), the length of each request including both computed tokens - and newly scheduled tokens""" - - num_computed_tokens_cpu: torch.Tensor """(batch_size,), the number of computed tokens for each request""" num_reqs: int @@ -81,7 +76,7 @@ class CommonAttentionMetadata: max_query_len: int """Longest query in batch""" max_seq_len: int - """Longest context length in batch""" + """Longest context length (may be an upper bound)""" block_table_tensor: torch.Tensor slot_mapping: torch.Tensor @@ -100,6 +95,40 @@ class CommonAttentionMetadata: dcp_local_seq_lens_cpu: torch.Tensor | None = None """Sequence lengths of the local rank in decode context parallelism world""" + # WARNING: Deprecated fields. Will be removed in a future release (v0.14.0) + _seq_lens_cpu: torch.Tensor | None = None + _num_computed_tokens_cpu: torch.Tensor | None = None + + @property + @deprecated( + """ + Prefer using device seq_lens directly to avoid implicit H<>D sync. + If a CPU copy is needed, use `seq_lens.cpu()` instead. + Will be removed in a future release (v0.14.0) + """ + ) + def seq_lens_cpu(self) -> torch.Tensor: + if self._seq_lens_cpu is None: + self._seq_lens_cpu = self.seq_lens.to("cpu") + return self._seq_lens_cpu + + @property + @deprecated( + """ + Prefer using device seq_lens directly to avoid implicit H<>D sync which breaks full + async scheduling. If a CPU copy is needed, it can be derived from + query_start_loc_cpu and seq_lens. 
+ Will be removed in a future release (v0.14.0) + """ + ) + def num_computed_tokens_cpu(self) -> torch.Tensor: + if self._num_computed_tokens_cpu is None: + query_seq_lens = ( + self.query_start_loc_cpu[1:] - self.query_start_loc_cpu[:-1] + ) + self._num_computed_tokens_cpu = self.seq_lens_cpu - query_seq_lens + return self._num_computed_tokens_cpu + # TODO(lucas): remove once we have FULL-CG spec-decode support def unpadded( self, num_actual_tokens: int, num_actual_reqs: int @@ -109,8 +138,12 @@ class CommonAttentionMetadata: query_start_loc=self.query_start_loc[: num_actual_reqs + 1], query_start_loc_cpu=self.query_start_loc_cpu[: num_actual_reqs + 1], seq_lens=self.seq_lens[:num_actual_reqs], - seq_lens_cpu=self.seq_lens_cpu[:num_actual_reqs], - num_computed_tokens_cpu=self.num_computed_tokens_cpu[:num_actual_reqs], + _seq_lens_cpu=self._seq_lens_cpu[:num_actual_reqs] + if self._seq_lens_cpu is not None + else None, + _num_computed_tokens_cpu=self._num_computed_tokens_cpu[:num_actual_reqs] + if self._num_computed_tokens_cpu is not None + else None, num_reqs=num_actual_reqs, num_actual_tokens=num_actual_tokens, max_query_len=self.max_query_len, @@ -224,14 +257,14 @@ def _make_metadata_with_slice( query_start_loc=query_start_loc, query_start_loc_cpu=query_start_loc_cpu, seq_lens=seq_lens, - seq_lens_cpu=seq_lens_cpu, - num_computed_tokens_cpu=num_computed_tokens_cpu, num_reqs=num_requests, num_actual_tokens=num_actual_tokens, max_query_len=max_query_len, max_seq_len=max_seq_len, block_table_tensor=block_table_tensor, slot_mapping=slot_mapping, + _seq_lens_cpu=seq_lens_cpu, + _num_computed_tokens_cpu=num_computed_tokens_cpu, ) @@ -689,9 +722,7 @@ def make_local_attention_virtual_batches( return CommonAttentionMetadata( query_start_loc_cpu=query_start_loc_cpu, query_start_loc=query_start_loc_cpu.to(device=device, non_blocking=True), - seq_lens_cpu=seq_lens_cpu, seq_lens=seq_lens_cpu.to(device=device, non_blocking=True), - num_computed_tokens_cpu=torch.from_numpy(num_computed_tokens_local), num_reqs=len(seq_lens_cpu), num_actual_tokens=common_attn_metadata.num_actual_tokens, max_query_len=seqlens_q_local.max(), @@ -699,6 +730,8 @@ def make_local_attention_virtual_batches( block_table_tensor=block_table_local, slot_mapping=common_attn_metadata.slot_mapping, causal=True, + _seq_lens_cpu=seq_lens_cpu, + _num_computed_tokens_cpu=torch.from_numpy(num_computed_tokens_local), ) @@ -719,7 +752,6 @@ def make_kv_sharing_fast_prefill_common_attn_metadata( logits_indices = logits_indices_padded[:num_logits_indices] num_reqs = common_attn_metadata.num_reqs query_start_loc = common_attn_metadata.query_start_loc - seq_lens = common_attn_metadata.seq_lens # Example inputs # num_reqs: 3 # generation_indices: [14, 18, 19, 27] @@ -748,9 +780,7 @@ def make_kv_sharing_fast_prefill_common_attn_metadata( common_attn_metadata = CommonAttentionMetadata( query_start_loc=decode_query_start_loc, query_start_loc_cpu=decode_query_start_loc.to("cpu", non_blocking=True), - seq_lens=seq_lens, - seq_lens_cpu=seq_lens.to("cpu", non_blocking=True), - num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu, + seq_lens=common_attn_metadata.seq_lens, num_reqs=num_reqs, num_actual_tokens=total_num_decode_tokens, max_query_len=decode_max_query_len, @@ -758,6 +788,8 @@ def make_kv_sharing_fast_prefill_common_attn_metadata( block_table_tensor=common_attn_metadata.block_table_tensor, slot_mapping=common_attn_metadata.slot_mapping, causal=True, + _seq_lens_cpu=common_attn_metadata._seq_lens_cpu, + 
_num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu, ) return common_attn_metadata @@ -883,11 +915,15 @@ def split_decodes_and_prefills( return 0, num_reqs, 0, num_tokens if require_uniform: + # check if we are in a padded uniform batch; this is used for full-CGs, some + # requests may have a query length of 0 but since they are padding its fine + # to treat them as decodes (ensures num_decodes matches the captured size) + if torch.all((query_lens == query_lens[0]) | (query_lens == 0)): + assert num_reqs * query_lens[0] == num_tokens, "tokens not padded correctly" + return num_reqs, 0, num_tokens, 0 # all decodes is_prefill = query_lens != query_lens[0] else: - # 0-query len indicates a padded request; leave this at the back - # of the batch with the prefills - is_prefill = (query_lens > decode_threshold) | (query_lens == 0) + is_prefill = query_lens > decode_threshold if not torch.any(is_prefill): return num_reqs, 0, num_tokens, 0 diff --git a/vllm/v1/core/block_pool.py b/vllm/v1/core/block_pool.py index cfb2c02e00f1b..c779e3d34b3ed 100644 --- a/vllm/v1/core/block_pool.py +++ b/vllm/v1/core/block_pool.py @@ -397,6 +397,25 @@ class BlockPool: [block for block in blocks_list if block.ref_cnt == 0 and not block.is_null] ) + def evict_blocks(self, block_ids: set[int]) -> None: + """evict blocks from the prefix cache by their block IDs. + + only evicts blocks that are currently cached (have a hash). blocks + with ref_cnt > 0 are not freed from the block pool, only evicted + from the prefix cache hash table. + + Args: + block_ids: Set of block IDs to evict from cache. + """ + for block_id in block_ids: + assert block_id < len(self.blocks), ( + f"Invalid block_id {block_id} >= {len(self.blocks)}. " + f"This indicates a bug in the KV connector - workers should " + f"only report block IDs that were allocated by the scheduler." + ) + block = self.blocks[block_id] + self._maybe_evict_cached_block(block) + def reset_prefix_cache(self) -> bool: """Reset prefix cache. This function may be used in RLHF flows to invalid prefix caching after the weights are updated, diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 33e8c81514c5f..13086a66f6ea6 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -333,6 +333,14 @@ class KVCacheManager: """ self.coordinator.free(request.request_id) + def evict_blocks(self, block_ids: set[int]) -> None: + """evict blocks from the prefix cache by their block IDs. + + Args: + block_ids: Set of block IDs to evict from cache. + """ + self.block_pool.evict_blocks(block_ids) + def reset_prefix_cache(self) -> bool: """Reset prefix cache. This function may be used in RLHF flows to invalidate prefix caching after the weights are updated, diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py index 774200deed158..e4360de3717d1 100644 --- a/vllm/v1/core/kv_cache_utils.py +++ b/vllm/v1/core/kv_cache_utils.py @@ -687,7 +687,9 @@ def check_enough_kv_cache_memory( raise ValueError( "No available memory for the cache blocks. " "Try increasing `gpu_memory_utilization` when " - "initializing the engine." + "initializing the engine. " + "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ " + "for more details." ) max_model_len = vllm_config.model_config.max_model_len @@ -711,8 +713,10 @@ def check_enough_kv_cache_memory( f"cache is needed, which is larger than the available KV cache " f"memory ({available_memory / GiB_bytes:.2f} GiB). 
" f"{estimated_msg} " - f"Try increasing `gpu_memory_utilization` or decreasing " - f"`max_model_len` when initializing the engine." + f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` " + f"when initializing the engine. " + f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ " + f"for more details." ) diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index d858e840039c4..c3d504f2e72c3 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -106,6 +106,7 @@ class Scheduler(SchedulerInterface): # KV Connector pushes/pull of remote KVs for P/D and offloading. self.connector = None self.connector_prefix_cache_stats: PrefixCacheStats | None = None + self.recompute_kv_load_failures = True if self.vllm_config.kv_transfer_config is not None: assert not self.is_encoder_decoder, ( "Encoder-decoder models are not currently supported with KV connectors" @@ -117,6 +118,10 @@ class Scheduler(SchedulerInterface): ) if self.log_stats: self.connector_prefix_cache_stats = PrefixCacheStats() + kv_load_failure_policy = ( + self.vllm_config.kv_transfer_config.kv_load_failure_policy + ) + self.recompute_kv_load_failures = kv_load_failure_policy == "recompute" self.kv_event_publisher = EventPublisherFactory.create( self.kv_events_config, @@ -1066,7 +1071,7 @@ class Scheduler(SchedulerInterface): for req_id, num_tokens_scheduled in num_scheduled_tokens.items(): assert num_tokens_scheduled > 0 if failed_kv_load_req_ids and req_id in failed_kv_load_req_ids: - # Skip requests that were recovered from KV load failure + # skip failed or rescheduled requests from KV load failure continue request = self.requests.get(req_id) if request is None: @@ -1177,6 +1182,21 @@ class Scheduler(SchedulerInterface): # This is a rare case and unlikely to impact performance. self.waiting.remove_requests(stopped_preempted_reqs) + if failed_kv_load_req_ids and not self.recompute_kv_load_failures: + requests = [self.requests[req_id] for req_id in failed_kv_load_req_ids] + self.finish_requests(failed_kv_load_req_ids, RequestStatus.FINISHED_ERROR) + for request in requests: + outputs[request.client_index].append( + EngineCoreOutput( + request_id=request.request_id, + new_token_ids=[], + finish_reason=request.get_finished_reason(), + events=request.take_events(), + trace_headers=request.trace_headers, + num_cached_tokens=request.num_cached_tokens, + ) + ) + # KV Connector: update state for finished KV Transfers. if kv_connector_output: self._update_from_kv_xfer_finished(kv_connector_output) @@ -1610,8 +1630,11 @@ class Scheduler(SchedulerInterface): self._free_blocks(self.requests[req_id]) def _update_requests_with_invalid_blocks( - self, requests: Iterable[Request], invalid_block_ids: set[int] - ) -> tuple[set[str], int]: + self, + requests: Iterable[Request], + invalid_block_ids: set[int], + evict_blocks: bool = True, + ) -> tuple[set[str], int, set[int]]: """ Identify and update requests affected by invalid KV cache blocks. @@ -1623,16 +1646,21 @@ class Scheduler(SchedulerInterface): Args: requests: The set of requests to scan for invalid blocks. invalid_block_ids: IDs of invalid blocks. + evict_blocks: Whether to collect blocks for eviction (False for + async requests which aren't cached yet). Returns: tuple: - affected_req_ids (set[str]): IDs of requests impacted by invalid blocks. - total_affected_tokens (int): Total number of tokens that must - be recomputed across all affected requests (for observability). 
+ be recomputed across all affected requests. + - blocks_to_evict (set[int]): Block IDs to evict from cache, + including invalid blocks and downstream dependent blocks. """ affected_req_ids: set[str] = set() total_affected_tokens = 0 + blocks_to_evict: set[int] = set() # If a block is invalid and shared by multiple requests in the batch, # these requests must be rescheduled, but only the first will recompute # it. This set tracks blocks already marked for recomputation. @@ -1690,6 +1718,9 @@ class Scheduler(SchedulerInterface): ) total_affected_tokens += num_affected_tokens request.num_external_computed_tokens -= num_affected_tokens + # collect invalid block and all downstream dependent blocks + if evict_blocks: + blocks_to_evict.update(req_block_ids[idx:]) if is_affected: if not marked_invalid_block: @@ -1705,47 +1736,70 @@ class Scheduler(SchedulerInterface): affected_req_ids.add(request.request_id) - return affected_req_ids, total_affected_tokens + return affected_req_ids, total_affected_tokens, blocks_to_evict def _handle_invalid_blocks(self, invalid_block_ids: set[int]) -> set[str]: - total_requests_to_reschedule = 0 - total_tokens_to_reschedule = 0 + """ + Handle requests affected by invalid KV cache blocks. - # --- Handle async KV loads (WAITING_FOR_REMOTE_KVS) --- + Returns: + Set of affected request IDs to skip in update_from_output main loop. + """ + should_fail = not self.recompute_kv_load_failures + + # handle async KV loads (not cached yet, evict_blocks=False) async_load_reqs = ( req for req in self.waiting if req.status == RequestStatus.WAITING_FOR_REMOTE_KVS ) - async_affected_req_ids, num_tokens_to_reschedule = ( + async_failed_req_ids, num_failed_tokens, _ = ( self._update_requests_with_invalid_blocks( - async_load_reqs, invalid_block_ids + async_load_reqs, invalid_block_ids, evict_blocks=False ) ) - total_requests_to_reschedule += len(async_affected_req_ids) - total_tokens_to_reschedule += num_tokens_to_reschedule + total_failed_requests = len(async_failed_req_ids) + total_failed_tokens = num_failed_tokens - # Mark requests with async KV load failures; they will be rescheduled - # once loading completes. 
- self.failed_recving_kv_req_ids |= async_affected_req_ids - - # --- Handle sync KV loads (running requests) --- - sync_affected_req_ids, num_tokens_to_reschedule = ( - self._update_requests_with_invalid_blocks(self.running, invalid_block_ids) + # handle sync loads (may be cached, collect blocks for eviction) + sync_failed_req_ids, num_failed_tokens, sync_blocks_to_evict = ( + self._update_requests_with_invalid_blocks( + self.running, invalid_block_ids, evict_blocks=True + ) ) - total_requests_to_reschedule += len(sync_affected_req_ids) - total_tokens_to_reschedule += num_tokens_to_reschedule + total_failed_requests += len(sync_failed_req_ids) + total_failed_tokens += num_failed_tokens - if total_requests_to_reschedule: - logger.warning( - "Recovered from KV load failure: " - "%d request(s) rescheduled (%d tokens affected).", - total_requests_to_reschedule, - total_tokens_to_reschedule, + if not total_failed_requests: + return set() + + # evict invalid blocks and downstream dependent blocks from cache + # only when not using recompute policy (where blocks will be recomputed + # and reused by other requests sharing them) + if sync_blocks_to_evict and not self.recompute_kv_load_failures: + self.kv_cache_manager.evict_blocks(sync_blocks_to_evict) + + if should_fail: + all_failed_req_ids = async_failed_req_ids | sync_failed_req_ids + logger.error( + "Failing %d request(s) due to KV load failure " + "(failure_policy=fail, %d tokens affected). Request IDs: %s", + total_failed_requests, + total_failed_tokens, + all_failed_req_ids, ) + return all_failed_req_ids - # Return the IDs of affected running requests to skip in - # update_from_output. - return sync_affected_req_ids + logger.warning( + "Recovered from KV load failure: " + "%d request(s) rescheduled (%d tokens affected).", + total_failed_requests, + total_failed_tokens, + ) + + # Mark async requests with KV load failures for retry once loading completes + self.failed_recving_kv_req_ids |= async_failed_req_ids + # Return sync affected IDs to skip in update_from_output + return sync_failed_req_ids diff --git a/vllm/v1/cudagraph_dispatcher.py b/vllm/v1/cudagraph_dispatcher.py index ef0f8d9e67452..8a3500c0aac6b 100644 --- a/vllm/v1/cudagraph_dispatcher.py +++ b/vllm/v1/cudagraph_dispatcher.py @@ -145,7 +145,7 @@ class CudagraphDispatcher: num_tokens: int, uniform_decode: bool, has_lora: bool, - use_cascade_attn: bool = False, + disable_full: bool = False, ) -> tuple[CUDAGraphMode, BatchDescriptor]: """ Given conditions(e.g.,batch descriptor and if using cascade attention), @@ -165,7 +165,7 @@ class CudagraphDispatcher: ) relaxed_batch_desc = batch_desc.relax_for_mixed_batch_cudagraphs() - if not use_cascade_attn: + if not disable_full: # check if key exists for full cudagraph if batch_desc in self.cudagraph_keys[CUDAGraphMode.FULL]: return CUDAGraphMode.FULL, batch_desc diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index ce2aae77108da..4f54d12f4b8d0 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -19,24 +19,27 @@ from vllm.v1.serial_utils import UtilityResult # These are possible values of RequestOutput.finish_reason, # so form part of the external API. -FINISH_REASON_STRINGS = ("stop", "length", "abort") +FINISH_REASON_STRINGS = ("stop", "length", "abort", "error") class FinishReason(enum.IntEnum): """ - Reason a request finished - stop, length, or abort. + Reason a request finished - stop, length, abort, or error. Int rather than Str for more compact serialization. 
stop - a stop string was emitted length - max_tokens was consumed, or max_model_len was reached - abort - aborted for another reason + abort - aborted by client + error - retryable request-level internal error (e.g., KV load failure). + Invariant: always converted to 500 Internal Server Error. """ STOP = 0 LENGTH = 1 ABORT = 2 + ERROR = 3 def __str__(self): return FINISH_REASON_STRINGS[self.value] diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index fd7e04dc02082..8eff61563ccea 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -166,32 +166,24 @@ class AsyncLLM(EngineClient): pass if ( - envs.VLLM_TORCH_PROFILER_DIR - and not envs.VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM + vllm_config.profiler_config.profiler == "torch" + and not vllm_config.profiler_config.ignore_frontend ): + profiler_dir = vllm_config.profiler_config.torch_profiler_dir logger.info( "Torch profiler enabled. AsyncLLM CPU traces will be collected under %s", # noqa: E501 - envs.VLLM_TORCH_PROFILER_DIR, + profiler_dir, ) - if envs.VLLM_PROFILER_MAX_ITERS > 0 or envs.VLLM_PROFILER_DELAY_ITERS > 0: - logger.warning_once( - "Torch profiler received max_iters or delay_iters setting. These " - "are not compatible with the AsyncLLM profiler and will be ignored " - "for the AsyncLLM process. Engine process profiling will still " - "respect these settings. Consider setting " - "VLLM_TORCH_PROFILER_DISABLE_ASYNC_LLM=1 to disable " - "AsyncLLM profiling." - ) worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm" self.profiler = torch.profiler.profile( activities=[ torch.profiler.ProfilerActivity.CPU, ], - with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, + with_stack=vllm_config.profiler_config.torch_profiler_with_stack, on_trace_ready=torch.profiler.tensorboard_trace_handler( - envs.VLLM_TORCH_PROFILER_DIR, + profiler_dir, worker_name=worker_name, - use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP, + use_gzip=vllm_config.profiler_config.torch_profiler_use_gzip, ), ) else: @@ -200,7 +192,7 @@ class AsyncLLM(EngineClient): @property @deprecated( "`AsyncLLM.processor` has been renamed to `AsyncLLM.input_processor`. " - "The old name will be removed in v0.13." + "The old name will be removed in v0.14." ) def processor(self): return self.input_processor @@ -709,10 +701,6 @@ class AsyncLLM(EngineClient): def tokenizer(self) -> TokenizerLike | None: return self.input_processor.tokenizer - @tokenizer.setter - def tokenizer(self, tokenizer: TokenizerLike | None) -> None: - self.input_processor.tokenizer = tokenizer - async def get_tokenizer(self) -> TokenizerLike: if self.tokenizer is None: raise ValueError( diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 3d3a1e138ddef..0045b8c1dd3e7 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -211,6 +211,9 @@ class EngineCore: freeze_gc_heap() # If enable, attach GC debugger after static variable freeze. maybe_attach_gc_debug_callback() + # Enable environment variable cache (e.g. assume no more + # environment variable overrides after this point) + enable_envs_cache() def _initialize_kv_caches( self, vllm_config: VllmConfig @@ -672,10 +675,6 @@ class EngineCoreProc(EngineCore): assert addresses.coordinator_input is not None logger.info("Waiting for READY message from DP Coordinator...") - # Enable environment variable cache (e.g. 
assume no more - # environment variable overrides after this point) - enable_envs_cache() - @contextmanager def _perform_handshakes( self, diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index e6a94f4e3de5d..a3c18464d3f52 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -64,10 +64,6 @@ class InputProcessor: def tokenizer(self) -> TokenizerLike | None: return self.input_preprocessor.tokenizer - @tokenizer.setter - def tokenizer(self, tokenizer: TokenizerLike | None) -> None: - self.input_preprocessor.tokenizer = tokenizer - def _validate_logprobs( self, params: SamplingParams, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 4c31291005477..4422eced82fea 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -139,7 +139,7 @@ class LLMEngine: @property @deprecated( "`LLMEngine.processor` has been renamed to `LLMEngine.input_processor`. " - "The old name will be removed in v0.13." + "The old name will be removed in v0.14." ) def processor(self): return self.input_processor @@ -358,10 +358,6 @@ class LLMEngine: def tokenizer(self) -> TokenizerLike | None: return self.input_processor.tokenizer - @tokenizer.setter - def tokenizer(self, tokenizer: TokenizerLike | None) -> None: - self.input_processor.tokenizer = tokenizer - def get_tokenizer(self) -> TokenizerLike: if self.tokenizer is None: raise ValueError( diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index e85fbb4ee0fb0..9be3f4da7352d 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -650,6 +650,7 @@ class OutputProcessor: ), max_tokens_param=req_state.max_tokens_param, req_stats=req_state.stats, + num_cached_tokens=req_state.num_cached_tokens, ) self.lora_states.request_finished(req_state.request_id, req_state.lora_name) diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index bc5c7fc400fde..a8c93499299d3 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -10,7 +10,7 @@ def __getattr__(name: str): warnings.warn( "`vllm.v1.engine.processor.Processor` has been moved to " "`vllm.v1.engine.input_processor.InputProcessor`. " - "The old name will be removed in v0.13.", + "The old name will be removed in v0.14.", DeprecationWarning, stacklevel=2, ) diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py index 882e0ce0b2e03..9eaee1bb97bb9 100644 --- a/vllm/v1/metrics/loggers.py +++ b/vllm/v1/metrics/loggers.py @@ -870,6 +870,19 @@ class PrometheusStatLogger(AggregateStatLoggerBase): histogram_decode_time_request, engine_indexes, model_name ) + histogram_prefill_kv_computed_request = self._histogram_cls( + name="vllm:request_prefill_kv_computed_tokens", + documentation=( + "Histogram of new KV tokens computed during prefill " + "(excluding cached tokens)." 
+ ), + buckets=build_1_2_5_buckets(max_model_len), + labelnames=labelnames, + ) + self.histogram_prefill_kv_computed_request = make_per_engine( + histogram_prefill_kv_computed_request, engine_indexes, model_name + ) + # # KV Cache residency metrics # @@ -1118,6 +1131,13 @@ class PrometheusStatLogger(AggregateStatLoggerBase): self.histogram_decode_time_request[engine_idx].observe( finished_request.decode_time ) + # Calculate prefill KV compute (excludes cached tokens) + prefill_kv_computed = finished_request.num_prompt_tokens - max( + finished_request.num_cached_tokens, 0 + ) + self.histogram_prefill_kv_computed_request[engine_idx].observe( + prefill_kv_computed + ) self.histogram_num_prompt_tokens_request[engine_idx].observe( finished_request.num_prompt_tokens ) diff --git a/vllm/v1/metrics/stats.py b/vllm/v1/metrics/stats.py index 733d3ae12e67f..a0cc58d0a64e8 100644 --- a/vllm/v1/metrics/stats.py +++ b/vllm/v1/metrics/stats.py @@ -224,6 +224,7 @@ class FinishedRequestStats: decode_time: float = 0.0 mean_time_per_output_token: float = 0.0 is_corrupted: bool = False + num_cached_tokens: int = 0 class IterationStats: @@ -330,6 +331,7 @@ class IterationStats: num_prompt_tokens: int, max_tokens_param: int | None, req_stats: RequestStateStats, + num_cached_tokens: int = 0, ): e2e_latency = self._time_since(req_stats.arrival_time) @@ -367,6 +369,7 @@ class IterationStats: decode_time=decode_time, mean_time_per_output_token=mean_time_per_output_token, is_corrupted=req_stats.is_corrupted, + num_cached_tokens=num_cached_tokens, ) self.finished_requests.append(finished_req) diff --git a/vllm/v1/outputs.py b/vllm/v1/outputs.py index 546eacebf83e5..bea9e5846de13 100644 --- a/vllm/v1/outputs.py +++ b/vllm/v1/outputs.py @@ -12,9 +12,11 @@ from vllm.compilation.cuda_graph import CUDAGraphStat from vllm.v1.core.sched.output import SchedulerOutput if TYPE_CHECKING: + from vllm.distributed.kv_events import KVConnectorKVEvents from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats else: KVConnectorStats = object + KVConnectorKVEvents = object class LogprobsLists(NamedTuple): @@ -108,6 +110,7 @@ class KVConnectorOutput: finished_sending: set[str] | None = None finished_recving: set[str] | None = None kv_connector_stats: KVConnectorStats | None = None + kv_cache_events: KVConnectorKVEvents | None = None # IDs of externally computed KV blocks that failed to load. 
# Requests referencing these blocks should be rescheduled to recompute them invalid_block_ids: set[int] = field(default_factory=set) @@ -123,6 +126,7 @@ class KVConnectorOutput: not self.finished_sending and not self.finished_recving and not self.kv_connector_stats + and not self.kv_cache_events and not self.invalid_block_ids ) diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 33762fe34e64f..a775e840e841c 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -255,6 +255,7 @@ class RequestStatus(enum.IntEnum): FINISHED_LENGTH_CAPPED = enum.auto() FINISHED_ABORTED = enum.auto() FINISHED_IGNORED = enum.auto() + FINISHED_ERROR = enum.auto() def __str__(self): return self.name @@ -277,4 +278,5 @@ _FINISHED_REASON_MAP = { RequestStatus.FINISHED_LENGTH_CAPPED: FinishReason.LENGTH, RequestStatus.FINISHED_ABORTED: FinishReason.ABORT, RequestStatus.FINISHED_IGNORED: FinishReason.LENGTH, + RequestStatus.FINISHED_ERROR: FinishReason.ERROR, } diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 31428db2d3afc..65a0a88ec0f5d 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -178,6 +178,12 @@ class EagleProposer: ) rocm_types.append(AiterFlashAttentionMetadata) + + # TRITON_MLA backend support for MLA models (e.g., DeepSeek) + from vllm.v1.attention.backends.mla.common import MLACommonMetadata + + rocm_types.append(MLACommonMetadata) + self.allowed_attn_types = tuple(rocm_types) # Parse the speculative token tree. @@ -440,16 +446,16 @@ class EagleProposer: # of main model. # Increment the sequence lengths. common_attn_metadata.seq_lens += 1 - # This is an out-of-place operation to avoid modifying the original tensor. - common_attn_metadata.seq_lens_cpu = common_attn_metadata.seq_lens_cpu + 1 # For the requests that exceed the max model length, we set the # sequence length to 1 to minimize their overheads in attention. - common_attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, 1) - common_attn_metadata.num_computed_tokens_cpu = ( - common_attn_metadata.seq_lens_cpu - 1 - ) + # Also update the CPU-side shadow; NOTE: this is hacky and should be + # removed in when common_attn_metadata.seq_lens_cpu is deprecated. + if common_attn_metadata._seq_lens_cpu is not None: + common_attn_metadata._seq_lens_cpu += 1 + if common_attn_metadata._num_computed_tokens_cpu is not None: + common_attn_metadata._num_computed_tokens_cpu += 1 # Compute the slot mapping. 
if self.uses_mrope: @@ -656,8 +662,8 @@ class EagleProposer: query_start_loc=common_attn_metadata.query_start_loc, seq_lens=common_attn_metadata.seq_lens, query_start_loc_cpu=query_start_loc_cpu, - seq_lens_cpu=common_attn_metadata.seq_lens_cpu, - num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu, + _seq_lens_cpu=common_attn_metadata._seq_lens_cpu, + _num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu, num_reqs=common_attn_metadata.num_reqs, num_actual_tokens=total_num_tokens, max_query_len=new_query_len_per_req.max().item(), @@ -932,8 +938,8 @@ class EagleProposer: query_start_loc=new_query_start_loc_cpu.to(device, non_blocking=True), seq_lens=new_seq_lens_cpu.to(device, non_blocking=True), query_start_loc_cpu=new_query_start_loc_cpu, - seq_lens_cpu=new_seq_lens_cpu, - num_computed_tokens_cpu=common_attn_metadata.num_computed_tokens_cpu, + _seq_lens_cpu=new_seq_lens_cpu, + _num_computed_tokens_cpu=common_attn_metadata._num_computed_tokens_cpu, num_reqs=common_attn_metadata.num_reqs, num_actual_tokens=total_num_tokens, max_query_len=new_query_len_per_req.max().item(), @@ -1258,7 +1264,7 @@ class EagleProposer: num_tokens_padded: int, ) -> tuple[int, torch.Tensor]: # TODO(Flechman): support DBO ubatching - should_ubatch, num_toks_across_dp = coordinate_batch_across_dp( + should_ubatch, num_toks_across_dp, _ = coordinate_batch_across_dp( num_tokens_unpadded=num_tokens_unpadded, parallel_config=self.vllm_config.parallel_config, allow_microbatching=False, diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py index 12b903ccaca97..989478f348161 100644 --- a/vllm/v1/spec_decode/medusa.py +++ b/vllm/v1/spec_decode/medusa.py @@ -38,16 +38,16 @@ class MedusaProposer: self, target_hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata, - ) -> list[list[int]]: + ) -> torch.Tensor: # Generate blocks and compute logits blocks = self.model(target_hidden_states) logits = self.model.compute_logits(blocks) - # Get draft tokens and transpose the result - # TODO(woosuk): OPTIMIZATION: Return GPU tensor without GPU-CPU - # synchronization. 
- draft_tokens = [logit.argmax(dim=-1).tolist() for logit in logits] - return [list(row) for row in zip(*draft_tokens)] + # Compute argmax for each Medusa head and stack into a single tensor + # Shape: [batch_size, num_heads] + draft_tokens = torch.stack([logit.argmax(dim=-1) for logit in logits], dim=1) + + return draft_tokens def load_model(self, target_model: nn.Module) -> None: from vllm.compilation.backends import set_model_tag diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 5ee88178cdf60..4dd478804049b 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -339,7 +339,9 @@ class StructuredOutputManager: return True # Check if reasoning ends in *this* step - if self.reasoner.is_reasoning_end(request.all_token_ids): + if self.reasoner.is_reasoning_end_streaming( + request.all_token_ids, request.all_token_ids[request.num_computed_tokens :] + ): # Reasoning just ended, so we shouldn't advance til # next pass structured_req.reasoning_ended = True diff --git a/vllm/v1/structured_output/backend_xgrammar.py b/vllm/v1/structured_output/backend_xgrammar.py index f8a2df43dd90e..826ee08caa4e2 100644 --- a/vllm/v1/structured_output/backend_xgrammar.py +++ b/vllm/v1/structured_output/backend_xgrammar.py @@ -10,7 +10,7 @@ import torch import vllm.envs from vllm.logger import init_logger from vllm.sampling_params import SamplingParams -from vllm.tokenizers import MistralTokenizer +from vllm.tokenizers import DeepseekV32Tokenizer, MistralTokenizer from vllm.utils.import_utils import LazyLoader from vllm.v1.structured_output.backend_types import ( StructuredOutputBackend, @@ -56,6 +56,27 @@ class XgrammarBackend(StructuredOutputBackend): stop_token_ids=stop_token_ids, add_prefix_space=True, ) + elif isinstance(self.tokenizer, DeepseekV32Tokenizer): + # copy from xgr.TokenizerInfo.from_huggingface() + # because we are using a custom tokenizer wrapper here. 
+ vocab_dict = self.tokenizer.get_vocab() + tokenizer_vocab_size = max(len(vocab_dict), self.tokenizer.max_token_id + 1) + vocab_size = self.vocab_size or tokenizer_vocab_size + # maintain tokenizer's indexing + encoded_vocab = [""] * vocab_size + for token, idx in vocab_dict.items(): + if idx < vocab_size: + encoded_vocab[idx] = token + stop_token_ids = [self.tokenizer.eos_token_id] + backend_str = self.tokenizer.tokenizer.backend_tokenizer.to_str() + metadata = xgr.TokenizerInfo._detect_metadata_from_hf(backend_str) + tokenizer_info = xgr.TokenizerInfo( + encoded_vocab=encoded_vocab, + vocab_type=metadata["vocab_type"], + vocab_size=vocab_size, + stop_token_ids=stop_token_ids, + add_prefix_space=metadata["add_prefix_space"], + ) else: tokenizer_info = xgr.TokenizerInfo.from_huggingface( self.tokenizer, diff --git a/vllm/v1/worker/cp_utils.py b/vllm/v1/worker/cp_utils.py new file mode 100644 index 0000000000000..f666c739b0be7 --- /dev/null +++ b/vllm/v1/worker/cp_utils.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import TYPE_CHECKING, Any, cast + +from vllm.config import VllmConfig, get_layers_from_vllm_config + +if TYPE_CHECKING: + from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase +else: + AttentionLayerBase = object + + +def check_attention_cp_compatibility(vllm_config: VllmConfig) -> None: + pcp_size = vllm_config.parallel_config.prefill_context_parallel_size + dcp_size = vllm_config.parallel_config.decode_context_parallel_size + interleave_size = vllm_config.parallel_config.cp_kv_cache_interleave_size + if pcp_size * dcp_size > 1: + layer_type = cast(type[Any], AttentionLayerBase) + layers = get_layers_from_vllm_config(vllm_config, layer_type) + for layer in layers.values(): + layer_impl = getattr(layer, "impl", None) + if layer_impl is None: + continue + if vllm_config.speculative_config is not None and interleave_size > 1: + assert layer_impl.supports_mtp_with_cp_non_trivial_interleave_size, ( + "MTP with cp_kv_cache_interleave_size > 1 is not " + f"supported in {layer_impl.__class__.__name__}." + ) + if dcp_size > 1: + assert layer_impl.need_to_return_lse_for_decode, ( + "DCP requires attention impls to return" + " the softmax lse for decode, but the impl " + f"{layer_impl.__class__.__name__} " + "does not return the softmax lse for decode." + ) + + if pcp_size > 1: + assert layer_impl.supports_pcp, ( + "PCP requires attention impls' support, " + f"but the impl {layer_impl.__class__.__name__} " + "does not support PCP." + ) diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index b080fea1d2dd6..e54b995ab908f 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -13,6 +13,7 @@ from vllm.logger import init_logger from vllm.model_executor.utils import set_random_seed from vllm.platforms import CpuArchEnum, current_platform from vllm.platforms.cpu import CpuPlatform, LogicalCPUInfo +from vllm.profiler.wrapper import TorchProfilerWrapper from vllm.v1.worker.cpu_model_runner import CPUModelRunner from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment @@ -38,30 +39,17 @@ class CPUWorker(Worker): self.parallel_config.disable_custom_all_reduce = True - # Torch profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + # Torch profiler. Enabled and configured through profiler_config. 
self.profiler: Any | None = None - if envs.VLLM_TORCH_PROFILER_DIR: - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + profiler_config = vllm_config.profiler_config + if profiler_config.profiler == "torch": worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" - logger.info( - "Profiling enabled. Traces will be saved to: %s", - torch_profiler_trace_dir, + self.profiler = TorchProfilerWrapper( + profiler_config, + worker_name=worker_name, + local_rank=self.local_rank, + activities=["CPU"], ) - self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - ], - record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, - with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, - on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, worker_name=worker_name, use_gzip=False - ), - ) - else: - self.profiler = None def init_device(self): # Setup OpenMP threads affinity. @@ -202,9 +190,3 @@ class CPUWorker(Worker): self.profiler.start() else: self.profiler.stop() - if self.local_rank == 0: - logger.info( - self.profiler.key_averages().table( - sort_by="self_cpu_time_total", row_limit=50 - ) - ) diff --git a/vllm/v1/worker/dp_utils.py b/vllm/v1/worker/dp_utils.py index 5da55d740c347..1b9646e1980a8 100644 --- a/vllm/v1/worker/dp_utils.py +++ b/vllm/v1/worker/dp_utils.py @@ -40,16 +40,18 @@ def _run_ar( should_dp_pad: bool, orig_num_tokens_per_ubatch: int, padded_num_tokens_per_ubatch: int, + cudagraph_mode: int, parallel_config: ParallelConfig, ) -> torch.Tensor: dp_size = parallel_config.data_parallel_size dp_rank = parallel_config.data_parallel_rank device, group = _get_device_and_group(parallel_config) - tensor = torch.zeros(4, dp_size, device=device, dtype=torch.int32) + tensor = torch.zeros(5, dp_size, device=device, dtype=torch.int32) tensor[0][dp_rank] = orig_num_tokens_per_ubatch tensor[1][dp_rank] = padded_num_tokens_per_ubatch tensor[2][dp_rank] = 1 if should_ubatch else 0 tensor[3][dp_rank] = 1 if should_dp_pad else 0 + tensor[4][dp_rank] = cudagraph_mode dist.all_reduce(tensor, group=group) return tensor @@ -89,13 +91,23 @@ def _post_process_dp_padding(tensor: torch.Tensor, should_dp_pad: bool) -> torch return num_tokens_across_dp.cpu() +def _post_process_cudagraph_mode(tensor: torch.Tensor) -> int: + """ + Synchronize cudagraph_mode across DP ranks by taking the minimum. + If any rank has NONE (0), all ranks use NONE. + This ensures all ranks send consistent values (all padded or all unpadded). + """ + return int(tensor[4, :].min().item()) + + def _synchronize_dp_ranks( num_tokens_unpadded: int, num_tokens_padded: int, should_attempt_ubatching: bool, should_attempt_dp_padding: bool, + cudagraph_mode: int, parallel_config: ParallelConfig, -) -> tuple[bool, torch.Tensor | None]: +) -> tuple[bool, torch.Tensor | None, int]: """ 1. Decides if each DP rank is going to microbatch. Either all ranks run with microbatching or none of them do. @@ -104,10 +116,13 @@ def _synchronize_dp_ranks( When running microbatched or if should_attempt_dp_padding is True, all ranks will be padded out so that the run with the same number of tokens + 3. Synchronizes cudagraph_mode across ranks by taking the minimum. + Returns: tuple[ should_ubatch: Are all DP ranks going to microbatch num_tokens_after_padding: A tensor containing the total number of tokens per-microbatch for each DP rank including any DP padding. 
+ synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks) ] """ @@ -121,6 +136,7 @@ def _synchronize_dp_ranks( should_dp_pad=should_attempt_dp_padding, orig_num_tokens_per_ubatch=num_tokens_unpadded, padded_num_tokens_per_ubatch=num_tokens_padded, + cudagraph_mode=cudagraph_mode, parallel_config=parallel_config, ) @@ -148,7 +164,10 @@ def _synchronize_dp_ranks( should_dp_pad, ) - return should_ubatch, num_tokens_after_padding + # Synchronize cudagraph_mode across ranks (take min) + synced_cudagraph_mode = _post_process_cudagraph_mode(tensor) + + return should_ubatch, num_tokens_after_padding, synced_cudagraph_mode def coordinate_batch_across_dp( @@ -159,7 +178,8 @@ def coordinate_batch_across_dp( num_tokens_padded: int | None = None, uniform_decode: bool | None = None, num_scheduled_tokens_per_request: np.ndarray | None = None, -) -> tuple[bool, torch.Tensor | None]: + cudagraph_mode: int = 0, +) -> tuple[bool, torch.Tensor | None, int]: """ Coordinates amongst all DP ranks to determine if and how the full batch should be split into microbatches. @@ -175,6 +195,7 @@ def coordinate_batch_across_dp( only contains single token decodes num_scheduled_tokens_per_request: Only used if allow_microbatching is True. The number of tokens per request. + cudagraph_mode: The cudagraph mode for this rank (0=NONE, 1=PIECEWISE, 2=FULL) Returns: tuple[ ubatch_slices: if this is set then all DP ranks have agreed to @@ -183,12 +204,13 @@ def coordinate_batch_across_dp( tokens per-microbatch for each DP rank including padding. Will be padded up to the max value across all DP ranks when allow_dp_padding is True. + synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks) ] """ if parallel_config.data_parallel_size == 1: # Early exit. - return False, None + return False, None, cudagraph_mode # If the caller has explicitly enabled microbatching. should_attempt_ubatching = False @@ -204,12 +226,15 @@ def coordinate_batch_across_dp( if num_tokens_padded is None: num_tokens_padded = num_tokens_unpadded - (should_ubatch, num_tokens_after_padding) = _synchronize_dp_ranks( - num_tokens_unpadded, - num_tokens_padded, - should_attempt_ubatching, - allow_dp_padding, - parallel_config, + (should_ubatch, num_tokens_after_padding, synced_cudagraph_mode) = ( + _synchronize_dp_ranks( + num_tokens_unpadded, + num_tokens_padded, + should_attempt_ubatching, + allow_dp_padding, + cudagraph_mode, + parallel_config, + ) ) - return (should_ubatch, num_tokens_after_padding) + return (should_ubatch, num_tokens_after_padding, synced_cudagraph_mode) diff --git a/vllm/v1/worker/gpu/async_utils.py b/vllm/v1/worker/gpu/async_utils.py index f6bc607c1ae67..a2e3decad0486 100644 --- a/vllm/v1/worker/gpu/async_utils.py +++ b/vllm/v1/worker/gpu/async_utils.py @@ -2,14 +2,15 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from contextlib import contextmanager +import numpy as np import torch from vllm.v1.outputs import ( AsyncModelRunnerOutput, LogprobsTensors, ModelRunnerOutput, - SamplerOutput, ) +from vllm.v1.worker.gpu.sample.output import SamplerOutput class AsyncOutput(AsyncModelRunnerOutput): @@ -34,29 +35,18 @@ class AsyncOutput(AsyncModelRunnerOutput): with torch.cuda.stream(self.copy_stream): self.copy_stream.wait_stream(default_stream) - # NOTE(woosuk): We must ensure that CPU tensors are not freed - # before the device-to-host copy is fully completed. 
For instance, - # operations like - # self.sampled_token_np = ...to("cpu", non_blocking=True).numpy() - # are unsafe because the underlying CPU tensor can be prematurely freed and - # reused by other tensors before the asynchronous copy finishes, potentially - # causing race conditions. To prevent this, we delay freeing by holding - # references until the copy event signals completion. - # Likewise, we also need to keep the reference to the GPU tensors. - # This is done by keeping the reference to sampler_output and - # model_runner_output. - self.sampled_token_ids = sampler_output.sampled_token_ids.to( - "cpu", non_blocking=True - ) + self.sampled_token_ids = async_copy_to_np(sampler_output.sampled_token_ids) if sampler_output.logprobs_tensors is not None: self.logprobs_tensors: LogprobsTensors | None = ( sampler_output.logprobs_tensors.to_cpu_nonblocking() ) else: self.logprobs_tensors = None - self.num_sampled_tokens_cpu = num_sampled_tokens.to( - "cpu", non_blocking=True - ) + if sampler_output.num_nans is not None: + self.num_nans = async_copy_to_np(sampler_output.num_nans) + else: + self.num_nans = None + self.num_sampled_tokens_np = async_copy_to_np(num_sampled_tokens) self.prompt_logprobs_dict: dict[str, LogprobsTensors | None] = {} if self.model_runner_output.prompt_logprobs_dict: for k, v in self.model_runner_output.prompt_logprobs_dict.items(): @@ -68,7 +58,6 @@ class AsyncOutput(AsyncModelRunnerOutput): def get_output(self) -> ModelRunnerOutput: self.copy_event.synchronize() - num_sampled_tokens_np = self.num_sampled_tokens_cpu.numpy() # NOTE(woosuk): The following code is to ensure compatibility with # the existing model runner. @@ -76,10 +65,18 @@ class AsyncOutput(AsyncModelRunnerOutput): # rather than Python lists. sampled_token_ids: list[list[int]] = self.sampled_token_ids.tolist() num_reqs = len(sampled_token_ids) + num_sampled_tokens = self.num_sampled_tokens_np.tolist() for i in range(num_reqs): - del sampled_token_ids[i][num_sampled_tokens_np[i] :] + del sampled_token_ids[i][num_sampled_tokens[i] :] self.model_runner_output.sampled_token_ids = sampled_token_ids + if self.num_nans is not None: + num_nans = self.num_nans.tolist() + self.model_runner_output.num_nans_in_logits = { + req_id: num_nans[i] + for i, req_id in enumerate(self.model_runner_output.req_ids) + } + if self.logprobs_tensors is not None: self.model_runner_output.logprobs = self.logprobs_tensors.tolists() self.model_runner_output.prompt_logprobs_dict = self.prompt_logprobs_dict @@ -95,3 +92,7 @@ def async_barrier(event: torch.cuda.Event | None): finally: if event is not None: event.record() + + +def async_copy_to_np(x: torch.Tensor) -> np.ndarray: + return x.to("cpu", non_blocking=True).numpy() diff --git a/vllm/v1/worker/gpu/attn_utils.py b/vllm/v1/worker/gpu/attn_utils.py index 5aa1a33d851cc..6386f1a08b446 100644 --- a/vllm/v1/worker/gpu/attn_utils.py +++ b/vllm/v1/worker/gpu/attn_utils.py @@ -168,9 +168,9 @@ def build_attn_metadata( query_start_loc=query_start_loc_gpu, query_start_loc_cpu=query_start_loc_cpu, seq_lens=seq_lens, - seq_lens_cpu=seq_lens_cpu, + _seq_lens_cpu=seq_lens_cpu, max_seq_len=max_seq_len, - num_computed_tokens_cpu=num_computed_tokens_cpu, + _num_computed_tokens_cpu=num_computed_tokens_cpu, num_reqs=num_reqs, num_actual_tokens=num_tokens, max_query_len=max_query_len, diff --git a/vllm/v1/worker/gpu/metrics/__init__.py b/vllm/v1/worker/gpu/metrics/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/worker/gpu/metrics/logits.py 
b/vllm/v1/worker/gpu/metrics/logits.py new file mode 100644 index 0000000000000..fd7b30beaa1f8 --- /dev/null +++ b/vllm/v1/worker/gpu/metrics/logits.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import torch +from torch._inductor.runtime.triton_helpers import libdevice + +from vllm.triton_utils import tl, triton + + +@triton.jit +def _num_nans_kernel( + logits_ptr, + logits_stride, + num_nans_ptr, + vocab_size, + BLOCK_SIZE: tl.constexpr, +): + req_idx = tl.program_id(0) + num_nans = 0 + for i in range(0, vocab_size, BLOCK_SIZE): + block = i + tl.arange(0, BLOCK_SIZE) + mask = block < vocab_size + logits = tl.load( + logits_ptr + req_idx * logits_stride + block, mask=mask, other=0 + ) + logits = logits.to(tl.float32) + is_nan = libdevice.isnan(logits).to(tl.int1) + num_nans += tl.sum(is_nan).to(tl.int32) + tl.store(num_nans_ptr + req_idx, num_nans) + + +def get_num_nans(logits: torch.Tensor) -> torch.Tensor: + num_reqs, vocab_size = logits.shape + BLOCK_SIZE = 8192 + num_nans = torch.empty(num_reqs, dtype=torch.int32, device=logits.device) + _num_nans_kernel[(num_reqs,)]( + logits, + logits.stride(0), + num_nans, + vocab_size, + BLOCK_SIZE=BLOCK_SIZE, + ) + return num_nans diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index 464f7b7bd3532..9f4c6edfb6aa9 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -25,7 +25,6 @@ from vllm.v1.outputs import ( LogprobsTensors, ModelRunnerOutput, ) -from vllm.v1.sample.sampler import SamplerOutput from vllm.v1.worker.gpu.async_utils import AsyncOutput, async_barrier from vllm.v1.worker.gpu.attn_utils import ( build_attn_metadata, @@ -53,6 +52,7 @@ from vllm.v1.worker.gpu.sample.metadata import ( SamplingMetadata, expand_sampling_metadata, ) +from vllm.v1.worker.gpu.sample.output import SamplerOutput from vllm.v1.worker.gpu.sample.sampler import Sampler from vllm.v1.worker.gpu.spec_decode import init_speculator from vllm.v1.worker.gpu.spec_decode.rejection_sample import rejection_sample diff --git a/vllm/v1/worker/gpu/sample/min_p.py b/vllm/v1/worker/gpu/sample/min_p.py index 0638818006f50..c98a42cb2b1bb 100644 --- a/vllm/v1/worker/gpu/sample/min_p.py +++ b/vllm/v1/worker/gpu/sample/min_p.py @@ -39,9 +39,7 @@ def _min_p_kernel( tl.store(logits_ptr + req_idx * logits_stride + block, logits, mask=mask) -def apply_min_p(logits: torch.Tensor, min_p: torch.Tensor | None) -> None: - if min_p is None: - return +def apply_min_p(logits: torch.Tensor, min_p: torch.Tensor) -> None: num_reqs, vocab_size = logits.shape BLOCK_SIZE = 1024 _min_p_kernel[(num_reqs,)]( diff --git a/vllm/v1/worker/gpu/sample/output.py b/vllm/v1/worker/gpu/sample/output.py new file mode 100644 index 0000000000000..13e8cf1d6c1ec --- /dev/null +++ b/vllm/v1/worker/gpu/sample/output.py @@ -0,0 +1,14 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from dataclasses import dataclass + +import torch + +from vllm.v1.outputs import LogprobsTensors + + +@dataclass +class SamplerOutput: + sampled_token_ids: torch.Tensor + logprobs_tensors: LogprobsTensors | None + num_nans: torch.Tensor | None diff --git a/vllm/v1/worker/gpu/sample/penalties.py b/vllm/v1/worker/gpu/sample/penalties.py index c8d4b7d81841d..b4fcc822ecfce 100644 --- a/vllm/v1/worker/gpu/sample/penalties.py +++ b/vllm/v1/worker/gpu/sample/penalties.py @@ -62,6 +62,7 @@ def _penalties_and_temperature_kernel( 
mask=packed_block < tl.cdiv(vocab_size, 32), ) prompt_bin_mask = (packed_mask[:, None] >> (tl.arange(0, 32)[None, :])) & 1 + prompt_bin_mask = prompt_bin_mask.to(tl.int1) prompt_bin_mask = prompt_bin_mask.reshape(BLOCK_SIZE) # If token appears in prompt or output, apply, otherwise use 1.0 for no-op. diff --git a/vllm/v1/worker/gpu/sample/sampler.py b/vllm/v1/worker/gpu/sample/sampler.py index 9a4224d8fddef..84a3e18671b2c 100644 --- a/vllm/v1/worker/gpu/sample/sampler.py +++ b/vllm/v1/worker/gpu/sample/sampler.py @@ -3,13 +3,15 @@ import torch +import vllm.envs as envs from vllm.config.model import LogprobsMode -from vllm.v1.outputs import SamplerOutput from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p +from vllm.v1.worker.gpu.metrics.logits import get_num_nans from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample from vllm.v1.worker.gpu.sample.logprob import compute_topk_logprobs from vllm.v1.worker.gpu.sample.metadata import SamplingMetadata from vllm.v1.worker.gpu.sample.min_p import apply_min_p +from vllm.v1.worker.gpu.sample.output import SamplerOutput from vllm.v1.worker.gpu.sample.penalties import apply_penalties_and_temperature @@ -21,12 +23,16 @@ class Sampler: if logprobs_mode not in ["processed_logprobs", "raw_logprobs"]: raise NotImplementedError(f"Unsupported logprobs_mode: {logprobs_mode}") self.logprobs_mode = logprobs_mode + self.compute_nans = envs.VLLM_COMPUTE_NANS_IN_LOGITS # False by default. def __call__( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> SamplerOutput: + # NOTE(woosuk): We intentionally compute num_nans before sampling so that it + # reflects the raw logits, i.e., before penalties and temperature are applied. + num_nans = get_num_nans(logits) if self.compute_nans else None sampled, processed_logits = self.sample(logits, sampling_metadata) if sampling_metadata.max_num_logprobs is not None: logits = ( @@ -49,6 +55,7 @@ class Sampler: # token per request. sampled_token_ids=sampled.view(-1, 1), logprobs_tensors=logprobs_tensors, + num_nans=num_nans, ) return sampler_output @@ -63,7 +70,8 @@ class Sampler: # Apply penalties and temperature in place. apply_penalties_and_temperature(logits, sampling_metadata) # Apply min_p in place. - apply_min_p(logits, sampling_metadata.min_p) + if sampling_metadata.min_p is not None: + apply_min_p(logits, sampling_metadata.min_p) # Apply top_k and/or top_p. This might return a new tensor.
logits = apply_top_k_top_p( logits, sampling_metadata.top_k, sampling_metadata.top_p diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 22a3f9d8d2dda..0e2bf9df9a18f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -148,6 +148,7 @@ from vllm.v1.spec_decode.ngram_proposer import NgramProposer from vllm.v1.spec_decode.suffix_decoding import SuffixDecodingProposer from vllm.v1.structured_output.utils import apply_grammar_bitmask from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext +from vllm.v1.worker.cp_utils import check_attention_cp_compatibility from vllm.v1.worker.dp_utils import coordinate_batch_across_dp from vllm.v1.worker.ec_connector_model_runner_mixin import ECConnectorModelRunnerMixin from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch @@ -1267,6 +1268,8 @@ class GPUModelRunner( if not isinstance(kv_cache_spec, CrossAttentionSpec): return None, None + # Zero out buffer for padding requests that are not actually scheduled (CGs) + self.encoder_seq_lens.np[:num_reqs] = 0 # Build encoder_seq_lens array mapping request indices to # encoder lengths for inputs scheduled in this batch for req_id in num_scheduled_tokens: @@ -1626,8 +1629,8 @@ class GPUModelRunner( query_start_loc=query_start_loc, query_start_loc_cpu=query_start_loc_cpu, seq_lens=seq_lens, - seq_lens_cpu=seq_lens_cpu, - num_computed_tokens_cpu=num_computed_tokens_cpu, + _seq_lens_cpu=seq_lens_cpu, + _num_computed_tokens_cpu=num_computed_tokens_cpu, num_actual_tokens=num_tokens_padded, num_reqs=num_reqs_padded, max_query_len=max_query_len, @@ -2764,6 +2767,7 @@ class GPUModelRunner( # be improved in model runner v2) force_uniform_decode: bool | None = None, force_has_lora: bool | None = None, + num_encoder_reqs: int = 0, ) -> tuple[ CUDAGraphMode, BatchDescriptor, @@ -2780,6 +2784,11 @@ class GPUModelRunner( if force_uniform_decode is None else force_uniform_decode ) + # Encoder-decoder models only support CG for decoder_step > 0 (no enc_output + # is present). Also, chunked-prefill is disabled, so batches are uniform.
+ has_encoder_output = ( + self.model_config.is_encoder_decoder and num_encoder_reqs > 0 + ) has_lora = ( len(self.input_batch.lora_id_to_lora_request) > 0 @@ -2788,17 +2797,19 @@ class GPUModelRunner( ) dispatch_cudagraph = ( - lambda num_tokens: self.cudagraph_dispatcher.dispatch( + lambda num_tokens, disable_full: self.cudagraph_dispatcher.dispatch( num_tokens=num_tokens, has_lora=has_lora, - use_cascade_attn=use_cascade_attn, uniform_decode=uniform_decode, + disable_full=disable_full, ) if not force_eager else (CUDAGraphMode.NONE, BatchDescriptor(num_tokens_padded)) ) - cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded) + cudagraph_mode, batch_descriptor = dispatch_cudagraph( + num_tokens_padded, use_cascade_attn or has_encoder_output + ) num_tokens_padded = batch_descriptor.num_tokens # Extra coordination when running data-parallel since we need to coordinate @@ -2813,23 +2824,28 @@ class GPUModelRunner( self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE ) - should_ubatch, num_tokens_across_dp = coordinate_batch_across_dp( - num_tokens_unpadded=num_tokens, - parallel_config=self.parallel_config, - allow_microbatching=allow_microbatching, - allow_dp_padding=allow_dp_padding, - num_tokens_padded=num_tokens_padded, - uniform_decode=uniform_decode, - num_scheduled_tokens_per_request=num_scheduled_tokens_np, + should_ubatch, num_tokens_across_dp, synced_cudagraph_mode = ( + coordinate_batch_across_dp( + num_tokens_unpadded=num_tokens, + parallel_config=self.parallel_config, + allow_microbatching=allow_microbatching, + allow_dp_padding=allow_dp_padding, + num_tokens_padded=num_tokens_padded, + uniform_decode=uniform_decode, + num_scheduled_tokens_per_request=num_scheduled_tokens_np, + cudagraph_mode=cudagraph_mode.value, + ) ) - # Extract DP padding if there is any + # Extract DP-synced values if num_tokens_across_dp is not None: dp_rank = self.parallel_config.data_parallel_rank num_tokens_padded = int(num_tokens_across_dp[dp_rank].item()) - - # Re-dispatch with DP padding - cudagraph_mode, batch_descriptor = dispatch_cudagraph(num_tokens_padded) + # Re-dispatch with DP padding so we have the correct batch_descriptor + cudagraph_mode, batch_descriptor = dispatch_cudagraph( + num_tokens_padded, + disable_full=synced_cudagraph_mode <= CUDAGraphMode.PIECEWISE.value, + ) # Assert to make sure the agreed upon token count is correct otherwise # num_tokens_across_dp will no-longer be valid assert batch_descriptor.num_tokens == num_tokens_padded @@ -2990,6 +3006,7 @@ class GPUModelRunner( num_scheduled_tokens_np=num_scheduled_tokens_np, max_num_scheduled_tokens=max_num_scheduled_tokens, use_cascade_attn=cascade_attn_prefix_lens is not None, + num_encoder_reqs=len(scheduler_output.scheduled_encoder_inputs), ) logger.debug( @@ -3555,74 +3572,89 @@ class GPUModelRunner( if self.parallel_config.enable_eplb: self.eplb_state = EplbState(self.parallel_config, self.device) eplb_models = 0 - with DeviceMemoryProfiler() as m: - time_before_load = time.perf_counter() - model_loader = get_model_loader(self.load_config) - self.model = model_loader.load_model( - vllm_config=self.vllm_config, model_config=self.model_config - ) - if self.lora_config: - self.model = self.load_lora_model( - self.model, self.vllm_config, self.device + + try: + with DeviceMemoryProfiler() as m: + time_before_load = time.perf_counter() + model_loader = get_model_loader(self.load_config) + self.model = model_loader.load_model( + vllm_config=self.vllm_config, model_config=self.model_config ) - if 
hasattr(self, "drafter"): - logger.info_once("Loading drafter model...") - self.drafter.load_model(self.model) - if ( - hasattr(self.drafter, "model") - and is_mixture_of_experts(self.drafter.model) - and self.parallel_config.enable_eplb - ): - spec_config = self.vllm_config.speculative_config - assert spec_config is not None - assert spec_config.draft_model_config is not None - logger.info_once( - "EPLB is enabled for drafter model %s.", - spec_config.draft_model_config.model, + if self.lora_config: + self.model = self.load_lora_model( + self.model, self.vllm_config, self.device ) + if hasattr(self, "drafter"): + logger.info_once("Loading drafter model...") + self.drafter.load_model(self.model) + if ( + hasattr(self.drafter, "model") + and is_mixture_of_experts(self.drafter.model) + and self.parallel_config.enable_eplb + ): + spec_config = self.vllm_config.speculative_config + assert spec_config is not None + assert spec_config.draft_model_config is not None + logger.info_once( + "EPLB is enabled for drafter model %s.", + spec_config.draft_model_config.model, + ) - global_expert_load = ( - global_expert_loads[eplb_models] - if global_expert_loads - else None - ) - old_global_expert_indices = ( - old_global_expert_indices_per_model[eplb_models] - if old_global_expert_indices_per_model - else None - ) - if self.eplb_state is None: - self.eplb_state = EplbState(self.parallel_config, self.device) - self.eplb_state.add_model( - self.drafter.model, - spec_config.draft_model_config, - global_expert_load, - old_global_expert_indices, - rank_mapping, - ) - eplb_models += 1 + global_expert_load = ( + global_expert_loads[eplb_models] + if global_expert_loads + else None + ) + old_global_expert_indices = ( + old_global_expert_indices_per_model[eplb_models] + if old_global_expert_indices_per_model + else None + ) + if self.eplb_state is None: + self.eplb_state = EplbState( + self.parallel_config, self.device + ) + self.eplb_state.add_model( + self.drafter.model, + spec_config.draft_model_config, + global_expert_load, + old_global_expert_indices, + rank_mapping, + ) + eplb_models += 1 - if self.use_aux_hidden_state_outputs: - if not supports_eagle3(self.get_model()): - raise RuntimeError( - "Model does not support EAGLE3 interface but " - "aux_hidden_state_outputs was requested" - ) + if self.use_aux_hidden_state_outputs: + if not supports_eagle3(self.get_model()): + raise RuntimeError( + "Model does not support EAGLE3 interface but " + "aux_hidden_state_outputs was requested" + ) - # Try to get auxiliary layers from speculative config, - # otherwise use model's default layers - aux_layers = self._get_eagle3_aux_layers_from_config() - if aux_layers: - logger.info( - "Using auxiliary layers from speculative config: %s", - aux_layers, - ) - else: - aux_layers = self.model.get_eagle3_aux_hidden_state_layers() + # Try to get auxiliary layers from speculative config, + # otherwise use model's default layers + aux_layers = self._get_eagle3_aux_layers_from_config() + if aux_layers: + logger.info( + "Using auxiliary layers from speculative config: %s", + aux_layers, + ) + else: + aux_layers = self.model.get_eagle3_aux_hidden_state_layers() - self.model.set_aux_hidden_state_layers(aux_layers) - time_after_load = time.perf_counter() - self.model_memory_usage = m.consumed_memory + self.model.set_aux_hidden_state_layers(aux_layers) + time_after_load = time.perf_counter() + self.model_memory_usage = m.consumed_memory + except torch.cuda.OutOfMemoryError as e: + msg = ( + "Failed to load model - not enough GPU 
memory. " + "Try lowering --gpu-memory-utilization to free memory for weights, " + "increasing --tensor-parallel-size, or using --quantization. " + "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ " + "for more tips." + ) + combined_msg = f"{msg} (original error: {e})" + logger.error(combined_msg) + raise e logger.info_once( "Model loading took %.4f GiB memory and %.6f seconds", self.model_memory_usage / GiB_bytes, @@ -4161,10 +4193,19 @@ class GPUModelRunner( if self.speculative_config and self.speculative_config.use_eagle(): assert isinstance(self.drafter, EagleProposer) + # Eagle currently only supports PIECEWISE cudagraphs. + # Therefore only use cudagraphs if the main model uses PIECEWISE + # NOTE(lucas): this is a hack, need to clean up. use_cudagraphs = ( - cudagraph_runtime_mode.has_mode(CUDAGraphMode.PIECEWISE) - and not self.speculative_config.enforce_eager - ) + ( + is_graph_capturing + and cudagraph_runtime_mode == CUDAGraphMode.PIECEWISE + ) + or ( + not is_graph_capturing + and cudagraph_runtime_mode != CUDAGraphMode.NONE + ) + ) and not self.speculative_config.enforce_eager # Note(gnovack) - We need to disable cudagraphs for one of the two # lora cases when cudagraph_specialize_lora is enabled. This is a @@ -4696,6 +4737,9 @@ class GPUModelRunner( attention_backend_list, kv_cache_config.kv_cache_groups ) + # Check if attention backend supports PCP&DCP and related features. + check_attention_cp_compatibility(self.vllm_config) + for i, attn_backend_map in enumerate(attention_backend_maps): self.attn_groups.append(create_attn_groups(attn_backend_map, i)) @@ -4855,7 +4899,7 @@ class GPUModelRunner( # we need to adjust the cudagraph sizes to be a multiple of the uniform # decode query length to avoid: https://github.com/vllm-project/vllm/issues/28207 # temp-fix: https://github.com/vllm-project/vllm/issues/28207#issuecomment-3504004536 - # Will be removed in the near future when we have seperate cudagraph capture + # Will be removed in the near future when we have separate cudagraph capture # sizes for decode and mixed prefill-decode. if ( cudagraph_mode.decode_mode() == CUDAGraphMode.FULL @@ -5354,20 +5398,6 @@ class GPUModelRunner( kv_transfer_group.register_kv_caches(kv_caches) kv_transfer_group.set_host_xfer_buffer_ops(copy_kv_blocks) - if self.dcp_world_size > 1: - layer_type = cast(type[Any], AttentionLayerBase) - layers = get_layers_from_vllm_config(self.vllm_config, layer_type) - for layer in layers.values(): - layer_impl = getattr(layer, "impl", None) - if layer_impl is None: - continue - assert layer_impl.need_to_return_lse_for_decode, ( - "DCP requires attention impls to return" - " the softmax lse for decode, but the impl " - f"{layer_impl.__class__.__name__} " - "does not return the softmax lse for decode." - ) - def may_add_encoder_only_layers_to_kv_cache_config(self) -> None: """ Add encoder-only layers to the KV cache config. 
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index a46ec2bd118fe..25ac5aaf99818 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -38,7 +38,7 @@ from vllm.model_executor import set_random_seed from vllm.model_executor.models.interfaces import is_mixture_of_experts from vllm.model_executor.warmup.kernel_warmup import kernel_warmup from vllm.platforms import current_platform -from vllm.profiler.gpu_profiler import CudaProfilerWrapper, TorchProfilerWrapper +from vllm.profiler.wrapper import CudaProfilerWrapper, TorchProfilerWrapper from vllm.sequence import IntermediateTensors from vllm.tasks import SupportedTask from vllm.utils.mem_constants import GiB_bytes @@ -79,6 +79,10 @@ class Worker(WorkerBase): is_driver_worker=is_driver_worker, ) + # configure float32 matmul precision according to vLLM env. + precision = envs.VLLM_FLOAT32_MATMUL_PRECISION + torch.backends.cuda.matmul.fp32_precision = precision + if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing from vllm.utils.import_utils import init_cached_hf_modules @@ -88,17 +92,19 @@ class Worker(WorkerBase): # Buffers saved before sleep self._sleep_saved_buffers: dict[str, torch.Tensor] = {} - # Torch/CUDA profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace - # VLLM_TORCH_CUDA_PROFILE=1 + # Torch/CUDA profiler. Enabled and configured through profiler_config. self.profiler: Any | None = None - if envs.VLLM_TORCH_PROFILER_DIR: + profiler_config = vllm_config.profiler_config + if profiler_config.profiler == "torch": worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" self.profiler = TorchProfilerWrapper( - worker_name=worker_name, local_rank=self.local_rank + profiler_config, + worker_name=worker_name, + local_rank=self.local_rank, + activities=["CPU", "CUDA"], ) - elif envs.VLLM_TORCH_CUDA_PROFILE: - self.profiler = CudaProfilerWrapper() + elif profiler_config.profiler == "cuda": + self.profiler = CudaProfilerWrapper(profiler_config) else: self.profiler = None diff --git a/vllm/v1/worker/kv_connector_model_runner_mixin.py b/vllm/v1/worker/kv_connector_model_runner_mixin.py index b799f1be73d9c..2bcc87b63bcdf 100644 --- a/vllm/v1/worker/kv_connector_model_runner_mixin.py +++ b/vllm/v1/worker/kv_connector_model_runner_mixin.py @@ -22,7 +22,6 @@ from vllm.distributed.kv_transfer import ( has_kv_transfer_group, ) from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBase -from vllm.distributed.kv_transfer.kv_connector.v1.metrics import KVConnectorStats from vllm.forward_context import get_forward_context, set_forward_context from vllm.logger import init_logger from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig @@ -138,16 +137,10 @@ class KVConnectorModelRunnerMixin: ) output.invalid_block_ids = kv_connector.get_block_ids_with_load_errors() - output.kv_connector_stats = ( - KVConnectorModelRunnerMixin.get_kv_connector_stats() - ) - kv_connector.clear_connector_metadata() + output.kv_connector_stats = kv_connector.get_kv_connector_stats() + output.kv_cache_events = kv_connector.get_kv_connector_kv_cache_events() - @staticmethod - def get_kv_connector_stats() -> KVConnectorStats | None: - if has_kv_transfer_group(): - return get_kv_transfer_group().get_kv_connector_stats() - return None + kv_connector.clear_connector_metadata() @staticmethod def use_uniform_kv_cache( diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index 
ce18ca6c37165..5f6136b178b46 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -10,7 +10,7 @@ import torch import torch.nn as nn import vllm.envs as envs -from vllm.config import VllmConfig +from vllm.config import VllmConfig, set_current_vllm_config from vllm.distributed import ( ensure_model_parallel_initialized, init_distributed_environment, @@ -98,10 +98,10 @@ class TPUWorker: # MP runtime is initialized. self.profiler = None self.profile_dir = None - if envs.VLLM_TORCH_PROFILER_DIR and self.rank < 1: + if vllm_config.profiler_config.profiler == "torch" and self.rank < 1: # For TPU, we can only have 1 active profiler session for 1 profiler # server. So we only profile on rank0. - self.profile_dir = envs.VLLM_TORCH_PROFILER_DIR + self.profile_dir = vllm_config.profiler_config.torch_profiler_dir logger.info( "Profiling enabled. Traces will be saved to: %s", self.profile_dir ) @@ -207,7 +207,8 @@ class TPUWorker: # one compiled bytecode. Having one FX graph/cached bytecode per # compiled model is required for `support_torch_compile` decorator to # skip dynamo guard. - self.model_runner.reset_dynamo_cache() + with set_current_vllm_config(self.vllm_config): + self.model_runner.reset_dynamo_cache() # Get the maximum amount of memory used by the model weights and # intermediate activations. diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 0b0e2006d73d2..e9c48223d58b9 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -135,7 +135,7 @@ class AttentionGroup: kv_cache_spec: KVCacheSpec kv_cache_group_id: int # When ubatching is enabled we will have a metadata builder for each ubatch - # so that if they use internal persistant buffers for cudagraphs, and they + # so that if they use internal persistent buffers for cudagraphs, and they # won't have to worry about conflicting with the other ubatches. metadata_builders: list[AttentionMetadataBuilder] = field( default_factory=lambda: [] @@ -313,8 +313,12 @@ def bind_kv_cache( # TODO - analyze where runner_kv_caches is used and the right # way to ensure it properly reflects multiple attention layers # in the same decoder block. - if current_platform.is_cuda_alike() or current_platform.is_xpu(): - # We know that the GPU runner is not impacted by this + if ( + current_platform.is_cuda_alike() + or current_platform.is_xpu() + or current_platform.is_cpu() + ): + # We know that the GPU / CPU runner is not impacted by this # case. Some test code depends on runner_kv_caches, but # not in a way that's impacted by ignoring this. pass diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index 267369c730368..1faa1a24ff0ea 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -6,12 +6,12 @@ from typing import Any import torch import torch.distributed -import vllm.envs as envs from vllm.config import VllmConfig from vllm.distributed import get_world_group from vllm.logger import init_logger from vllm.model_executor import set_random_seed from vllm.platforms import current_platform +from vllm.profiler.wrapper import TorchProfilerWrapper from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment from vllm.v1.worker.xpu_model_runner import XPUModelRunner @@ -36,41 +36,17 @@ class XPUWorker(Worker): assert device_config.device_type == "xpu" assert current_platform.is_xpu() - # Torch profiler. Enabled and configured through env vars: - # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + # Torch profiler. 
Enabled and configured through profiler_config. self.profiler: Any | None = None - if envs.VLLM_TORCH_PROFILER_DIR: - torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + profiler_config = vllm_config.profiler_config + if profiler_config.profiler == "torch": worker_name = f"{vllm_config.instance_id}-rank-{self.rank}" - logger.info( - "Profiling enabled. Traces will be saved to: %s", - torch_profiler_trace_dir, + self.profiler = TorchProfilerWrapper( + profiler_config, + worker_name=worker_name, + local_rank=self.local_rank, + activities=["CPU", "XPU"], ) - logger.debug( - "Profiler config: record_shapes=%s," - "profile_memory=%s,with_stack=%s,with_flops=%s", - envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - envs.VLLM_TORCH_PROFILER_WITH_STACK, - envs.VLLM_TORCH_PROFILER_WITH_FLOPS, - ) - self.profiler = torch.profiler.profile( - activities=[ - torch.profiler.ProfilerActivity.CPU, - torch.profiler.ProfilerActivity.XPU, - ], - record_shapes=envs.VLLM_TORCH_PROFILER_RECORD_SHAPES, - profile_memory=envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY, - with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK, - with_flops=envs.VLLM_TORCH_PROFILER_WITH_FLOPS, - on_trace_ready=torch.profiler.tensorboard_trace_handler( - torch_profiler_trace_dir, - worker_name=worker_name, - use_gzip=envs.VLLM_TORCH_PROFILER_USE_GZIP, - ), - ) - else: - self.profiler = None # we provide this function due to `torch.xpu.mem_get_info()` doesn't # return correct free_gpu_memory on intel client GPU. We need to