diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh deleted file mode 100644 index d7167161b0059..0000000000000 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env bash -set -euxo pipefail - -# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] -THRESHOLD=${1:-0.25} -NUM_Q=${2:-1319} -PORT=${3:-8030} -OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} -mkdir -p "${OUT_DIR}" - -wait_for_server() { - local port=$1 - timeout 600 bash -c ' - until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do - sleep 1 - done' -} - -MODEL="deepseek-ai/DeepSeek-V2-lite" - -# Set BACKENDS based on platform -if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then - # ROCm platform - BACKENDS=("allgather_reducescatter") - # Disable MOE padding for ROCm since it is causing eplb to fail - export VLLM_ROCM_MOE_PADDING=0 -else - # Non-ROCm platform (CUDA/other) - BACKENDS=("deepep_high_throughput" "deepep_low_latency") -fi - -cleanup() { - if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then - kill "${SERVER_PID}" 2>/dev/null || true - for _ in {1..20}; do - kill -0 "${SERVER_PID}" 2>/dev/null || break - sleep 0.5 - done - kill -9 "${SERVER_PID}" 2>/dev/null || true - fi -} -trap cleanup EXIT - -for BACK in "${BACKENDS[@]}"; do - VLLM_DEEP_GEMM_WARMUP=skip \ - VLLM_ALL2ALL_BACKEND=$BACK \ - vllm serve "$MODEL" \ - --enforce-eager \ - --tensor-parallel-size 2 \ - --data-parallel-size 2 \ - --enable-expert-parallel \ - --enable-eplb \ - --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ - --trust-remote-code \ - --max-model-len 2048 \ - --port $PORT & - SERVER_PID=$! - wait_for_server $PORT - - TAG=$(echo "$MODEL" | tr '/: \\n' '_____') - OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json" - python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} - python3 - <= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}" -PY - - cleanup - SERVER_PID= - sleep 1 - PORT=$((PORT+1)) -done diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh index 693418da6093e..8106f50f18f66 100644 --- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh +++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh @@ -50,7 +50,6 @@ for BACK in "${BACKENDS[@]}"; do --data-parallel-size 2 \ --enable-expert-parallel \ --enable-eplb \ - --eplb-config '{"window_size":200,"step_interval":600}' \ --trust-remote-code \ --max-model-len 2048 \ --port $PORT & diff --git a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh b/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh deleted file mode 100644 index 937a43d1a3221..0000000000000 --- a/.buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/usr/bin/env bash -set -euxo pipefail - -# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT] -THRESHOLD=${1:-0.25} -NUM_Q=${2:-1319} -PORT=${3:-8040} -OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled} -mkdir -p "${OUT_DIR}" - -wait_for_server() { - local port=$1 - timeout 600 bash -c ' - until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do - sleep 1 - done' -} - -MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct" - -# Set BACKENDS based on platform -if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then - # ROCm platform - BACKENDS=("allgather_reducescatter") - # Disable MOE padding for ROCm since it is causing eplb to fail - export VLLM_ROCM_MOE_PADDING=0 -else - # Non-ROCm platform (CUDA/other) - BACKENDS=("deepep_high_throughput" "deepep_low_latency") -fi - -cleanup() { - if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then - kill "${SERVER_PID}" 2>/dev/null || true - for _ in {1..20}; do - kill -0 "${SERVER_PID}" 2>/dev/null || break - sleep 0.5 - done - kill -9 "${SERVER_PID}" 2>/dev/null || true - fi -} -trap cleanup EXIT - -for BACK in "${BACKENDS[@]}"; do - VLLM_DEEP_GEMM_WARMUP=skip \ - VLLM_ALL2ALL_BACKEND=$BACK \ - vllm serve "$MODEL" \ - --enforce-eager \ - --tensor-parallel-size 4 \ - --enable-expert-parallel \ - --enable-eplb \ - --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \ - --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \ - --trust-remote-code \ - --max-model-len 2048 \ - --gpu-memory-utilization 0.9 \ - --port $PORT & - SERVER_PID=$! - wait_for_server $PORT - - TAG=$(echo "$MODEL" | tr '/: \\n' '_____') - OUT="${OUT_DIR}/${TAG}_${BACK}.json" - python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT} - python3 - <= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}" -PY - - cleanup - SERVER_PID= - sleep 1 - PORT=$((PORT+1)) -done diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 8fc3587f7813c..750e7c038351c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -1379,22 +1379,4 @@ steps: num_gpus: 2 working_dir: "/vllm-workspace" commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 - -- label: DeepSeek V2-Lite Async EPLB Accuracy - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030 - -- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy - timeout_in_minutes: 60 - gpu: h100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace" - commands: - - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040 + - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 \ No newline at end of file diff --git a/vllm/distributed/eplb/rebalance_execute.py b/vllm/distributed/eplb/rebalance_execute.py index 55856d940f001..376dad8a72ef1 100644 --- a/vllm/distributed/eplb/rebalance_execute.py +++ b/vllm/distributed/eplb/rebalance_execute.py @@ -322,6 +322,9 @@ async def transfer_layer( num_local_physical_experts = next(iter(expert_weights[0])).shape[0] assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts) assert num_physical_experts == ep_size * num_local_physical_experts + # A buffer to hold the expert weights in one layer during the exchange. + # NOTE: Currently we assume the same weights across different layers + # have the same shape. is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer( num_local_experts=num_local_physical_experts,