mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-26 09:24:25 +08:00
[CI] Add Async Eplb nightly CI tests (#29385)
Signed-off-by: David Chen <530634352@qq.com> Signed-off-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
parent
3f42b05fbc
commit
7fe9c1a223
@ -0,0 +1,73 @@
|
|||||||
|
#!/usr/bin/env bash
# Scheduled accuracy check: for each all2all backend, start `vllm serve` with
# async EPLB enabled, run tests/evals/gsm8k/gsm8k_eval.py against it, and fail
# if the reported accuracy is below THRESHOLD.
#
# Environment:
#   OUT_DIR - directory for the per-backend result JSON files
#             (default: /tmp/vllm-scheduled); created if missing.
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8030}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
|
||||||
|
|
||||||
|
#######################################
# Block until the server on 127.0.0.1 answers its /health endpoint.
# Globals:
#   SERVER_PID (read, optional) - default process to watch for early exit
# Arguments:
#   $1 - TCP port the server listens on
#   $2 - (optional) PID of the server process; defaults to ${SERVER_PID:-}.
#        When non-empty, the wait aborts as soon as that process is gone
#        instead of polling a crashed server until the timeout expires.
# Returns:
#   0 once /health responds successfully, 1 if the watched process exited
#   first, 124 (from timeout) if 600 seconds elapse.
#######################################
wait_for_server() {
  local port=$1
  local pid=${2:-${SERVER_PID:-}}
  # Port and PID are handed to the inner shell as positional parameters
  # rather than being spliced into the script text.
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:$1/health" > /dev/null; do
      if [[ -n "$2" ]] && ! kill -0 "$2" 2>/dev/null; then
        echo "server process $2 exited before becoming healthy" >&2
        exit 1
      fi
      sleep 1
    done' _ "$port" "$pid"
}
|
||||||
|
|
||||||
|
# Model passed to `vllm serve` for every backend in BACKENDS.
MODEL="deepseek-ai/DeepSeek-V2-lite"

# Set BACKENDS based on platform. ROCm is assumed when the rocm-smi tool is on
# PATH, /opt/rocm exists, or ROCM_PATH is set to a non-empty value.
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
|
||||||
|
|
||||||
|
#######################################
# Stop the background server recorded in SERVER_PID, if it is still running.
# Sends SIGTERM, polls for up to ~10 s (100 x 0.1 s), and escalates to SIGKILL
# only when the process is still alive after that grace period. A no-op when
# SERVER_PID is unset, empty, or names a process that no longer exists.
# Globals:
#   SERVER_PID (read)
# Returns:
#   0 always; kill failures are ignored on purpose because the process may
#   exit between the liveness check and the signal.
#######################################
cleanup() {
  local pid=${SERVER_PID:-}
  if [[ -z "${pid}" ]] || ! kill -0 "${pid}" 2>/dev/null; then
    return 0
  fi

  kill "${pid}" 2>/dev/null || true
  for _ in {1..100}; do
    kill -0 "${pid}" 2>/dev/null || break
    sleep 0.1
  done

  # Escalate only if SIGTERM was not enough.
  if kill -0 "${pid}" 2>/dev/null; then
    kill -9 "${pid}" 2>/dev/null || true
  fi
  return 0
}
# Make sure the server is torn down on every exit path, including failures.
trap cleanup EXIT
|
||||||
|
|
||||||
|
# For each all2all backend: start the server in the background, wait until it
# is healthy, run the GSM8K eval against it, assert the accuracy threshold,
# then stop the server and move to the next port.
for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
    --data-parallel-size 2 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --port "$PORT" &
  SERVER_PID=$!
  wait_for_server "$PORT"

  # Turn the model name into a filename-safe tag. The set is written as
  # '/: \n' (one backslash) so tr maps newline, not the letter "n".
  TAG=$(printf '%s' "$MODEL" | tr '/: \n' '____')
  OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"

  # Values are passed as argv and the heredoc delimiter is quoted, so nothing
  # from the shell is spliced into the Python source.
  python3 - "${OUT}" "${THRESHOLD}" "${MODEL}" "${BACK}" <<'PY'
import json
import sys

out_path, threshold, model, backend = sys.argv[1:5]
with open(out_path) as f:
    acc = json.load(f)['accuracy']
print(f"{model} {backend}: accuracy {acc:.3f}")
assert acc >= float(threshold), f"{model} {backend} accuracy {acc}"
PY

  cleanup
  # Cleared so the EXIT trap does not act on a PID that was already stopped.
  SERVER_PID=
  sleep 1
  # Each backend gets a fresh port.
  PORT=$((PORT+1))
done
|
||||||
@ -50,6 +50,7 @@ for BACK in "${BACKENDS[@]}"; do
|
|||||||
--data-parallel-size 2 \
|
--data-parallel-size 2 \
|
||||||
--enable-expert-parallel \
|
--enable-expert-parallel \
|
||||||
--enable-eplb \
|
--enable-eplb \
|
||||||
|
--eplb-config '{"window_size":200,"step_interval":600}' \
|
||||||
--trust-remote-code \
|
--trust-remote-code \
|
||||||
--max-model-len 2048 \
|
--max-model-len 2048 \
|
||||||
--port $PORT &
|
--port $PORT &
|
||||||
|
|||||||
@ -0,0 +1,74 @@
|
|||||||
|
#!/usr/bin/env bash
# Scheduled accuracy check: for each all2all backend, start `vllm serve` with
# async EPLB and a speculative-decoding config enabled, run
# tests/evals/gsm8k/gsm8k_eval.py against it, and fail if the reported
# accuracy is below THRESHOLD.
#
# Environment:
#   OUT_DIR - directory for the per-backend result JSON files
#             (default: /tmp/vllm-scheduled); created if missing.
set -euxo pipefail

# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
THRESHOLD=${1:-0.25}
NUM_Q=${2:-1319}
PORT=${3:-8040}
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
mkdir -p "${OUT_DIR}"
|
||||||
|
|
||||||
|
#######################################
# Block until the server on 127.0.0.1 answers its /health endpoint.
# Globals:
#   SERVER_PID (read, optional) - default process to watch for early exit
# Arguments:
#   $1 - TCP port the server listens on
#   $2 - (optional) PID of the server process; defaults to ${SERVER_PID:-}.
#        When non-empty, the wait aborts as soon as that process is gone
#        instead of polling a crashed server until the timeout expires.
# Returns:
#   0 once /health responds successfully, 1 if the watched process exited
#   first, 124 (from timeout) if 600 seconds elapse.
#######################################
wait_for_server() {
  local port=$1
  local pid=${2:-${SERVER_PID:-}}
  # Port and PID are handed to the inner shell as positional parameters
  # rather than being spliced into the script text.
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:$1/health" > /dev/null; do
      if [[ -n "$2" ]] && ! kill -0 "$2" 2>/dev/null; then
        echo "server process $2 exited before becoming healthy" >&2
        exit 1
      fi
      sleep 1
    done' _ "$port" "$pid"
}
|
||||||
|
|
||||||
|
# Model passed to `vllm serve` for every backend in BACKENDS.
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"

# Set BACKENDS based on platform. ROCm is assumed when the rocm-smi tool is on
# PATH, /opt/rocm exists, or ROCM_PATH is set to a non-empty value.
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
  # ROCm platform
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
|
||||||
|
|
||||||
|
#######################################
# Stop the background server recorded in SERVER_PID, if it is still running.
# Sends SIGTERM, polls for up to ~10 s (100 x 0.1 s), and escalates to SIGKILL
# only when the process is still alive after that grace period. A no-op when
# SERVER_PID is unset, empty, or names a process that no longer exists.
# Globals:
#   SERVER_PID (read)
# Returns:
#   0 always; kill failures are ignored on purpose because the process may
#   exit between the liveness check and the signal.
#######################################
cleanup() {
  local pid=${SERVER_PID:-}
  if [[ -z "${pid}" ]] || ! kill -0 "${pid}" 2>/dev/null; then
    return 0
  fi

  kill "${pid}" 2>/dev/null || true
  for _ in {1..100}; do
    kill -0 "${pid}" 2>/dev/null || break
    sleep 0.1
  done

  # Escalate only if SIGTERM was not enough.
  if kill -0 "${pid}" 2>/dev/null; then
    kill -9 "${pid}" 2>/dev/null || true
  fi
  return 0
}
# Make sure the server is torn down on every exit path, including failures.
trap cleanup EXIT
|
||||||
|
|
||||||
|
# For each all2all backend: start the server in the background, wait until it
# is healthy, run the GSM8K eval against it, assert the accuracy threshold,
# then stop the server and move to the next port.
for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 4 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
    --port "$PORT" &
  SERVER_PID=$!
  wait_for_server "$PORT"

  # Turn the model name into a filename-safe tag. The set is written as
  # '/: \n' (one backslash) so tr maps newline, not the letter "n" (which
  # would otherwise mangle "Qwen" and "Instruct").
  TAG=$(printf '%s' "$MODEL" | tr '/: \n' '____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"

  # Values are passed as argv and the heredoc delimiter is quoted, so nothing
  # from the shell is spliced into the Python source.
  python3 - "${OUT}" "${THRESHOLD}" "${MODEL}" "${BACK}" <<'PY'
import json
import sys

out_path, threshold, model, backend = sys.argv[1:5]
with open(out_path) as f:
    acc = json.load(f)['accuracy']
print(f"{model} {backend}: accuracy {acc:.3f}")
assert acc >= float(threshold), f"{model} {backend} accuracy {acc}"
PY

  cleanup
  # Cleared so the EXIT trap does not act on a PID that was already stopped.
  SERVER_PID=
  sleep 1
  # Each backend gets a fresh port.
  PORT=$((PORT+1))
done
|
||||||
@ -1373,4 +1373,22 @@ steps:
|
|||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
working_dir: "/vllm-workspace"
|
working_dir: "/vllm-workspace"
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||||
|
|
||||||
|
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
gpu: h100
|
||||||
|
optional: true
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace"
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||||
|
|
||||||
|
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
gpu: h100
|
||||||
|
optional: true
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace"
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||||
|
|||||||
@ -322,9 +322,6 @@ async def transfer_layer(
|
|||||||
num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
|
num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
|
||||||
assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
|
assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
|
||||||
assert num_physical_experts == ep_size * num_local_physical_experts
|
assert num_physical_experts == ep_size * num_local_physical_experts
|
||||||
# A buffer to hold the expert weights in one layer during the exchange.
|
|
||||||
# NOTE: Currently we assume the same weights across different layers
|
|
||||||
# have the same shape.
|
|
||||||
|
|
||||||
is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
|
is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
|
||||||
num_local_experts=num_local_physical_experts,
|
num_local_experts=num_local_physical_experts,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user