mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-22 21:15:02 +08:00
[ROCm][CI] Add "Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy Test" Back Into AMD CI (#30590)
Signed-off-by: David Chen <530634352@qq.com> Signed-off-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Signed-off-by: Micah Williamson <micah.williamson@amd.com> Co-authored-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com> Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
This commit is contained in:
parent
29f7d97715
commit
97f2f160fd
@ -0,0 +1,74 @@
|
|||||||
|
#!/usr/bin/env bash
# GSM8K accuracy smoke test for Qwen3-Next MTP + async EPLB.
# Usage: script.sh [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
set -euxo pipefail

# Positional arguments, with CI-friendly defaults.
THRESHOLD=${1:-0.25}   # minimum acceptable GSM8K accuracy
NUM_Q=${2:-1319}       # number of eval questions
PORT=${3:-8040}        # first server port; bumped per backend

# Results directory; honor a caller-provided OUT_DIR.
: "${OUT_DIR:=/tmp/vllm-scheduled}"
mkdir -p "$OUT_DIR"
|
||||||
|
|
||||||
|
# Block until the vLLM server answers on its /health endpoint, or fail
# after 600 seconds (timeout(1) exits 124 on expiry, tripping `set -e`).
# Arguments:
#   $1 - TCP port the server listens on (localhost)
wait_for_server() {
  local port=$1
  # Pass the port to the inner shell as a positional argument instead of
  # splicing it into the single-quoted script ('"$port"'): the value is
  # never re-parsed by the inner shell, so odd characters can't break
  # or inject into the polling loop.
  timeout 600 bash -c '
    until curl -sf "http://127.0.0.1:$1/health" > /dev/null; do
      sleep 1
    done' _ "$port"
}
|
||||||
|
|
||||||
|
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"

# Detect a ROCm installation: any of the rocm-smi tool, the default
# install prefix, or an explicit ROCM_PATH counts as evidence.
on_rocm() {
  command -v rocm-smi &> /dev/null || [[ -d /opt/rocm || -n "${ROCM_PATH:-}" ]]
}

# Pick the all2all backends to exercise for the detected platform.
if on_rocm; then
  BACKENDS=("allgather_reducescatter")
  # Disable MOE padding for ROCm since it is causing eplb to fail
  export VLLM_ROCM_MOE_PADDING=0
else
  # Non-ROCm platform (CUDA/other)
  BACKENDS=("deepep_high_throughput" "deepep_low_latency")
fi
|
||||||
|
|
||||||
|
# Tear down the background vLLM server, if one is still running.
# Sends SIGTERM, polls up to 20 times (10 s total) for the process to
# exit, then falls back to SIGKILL. Safe to call when SERVER_PID is
# empty, unset, or already dead.
cleanup() {
  local pid=${SERVER_PID:-}
  [[ -n "$pid" ]] || return 0
  kill -0 "$pid" 2>/dev/null || return 0
  kill "$pid" 2>/dev/null || true
  local attempt=0
  while (( attempt < 20 )) && kill -0 "$pid" 2>/dev/null; do
    sleep 0.5
    attempt=$((attempt + 1))
  done
  # Force-kill in case the server ignored SIGTERM; ignore "no such
  # process" if it exited during the polling loop.
  kill -9 "$pid" 2>/dev/null || true
}
trap cleanup EXIT
|
||||||
|
|
||||||
|
# Run the GSM8K accuracy check once per backend. Each iteration starts a
# fresh server, waits for it to come up, evaluates, asserts the accuracy
# threshold, then tears the server down and moves to the next port.
# Fix: quote all parameter expansions on command lines ($PORT, $BACK,
# ${NUM_Q}, ${OUT}) — OUT embeds the caller-overridable OUT_DIR, which
# may contain spaces, and unquoted use would word-split the argv.
for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
  VLLM_ALL2ALL_BACKEND="$BACK" \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 4 \
    --enable-expert-parallel \
    --enable-eplb \
    --eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
    --speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
    --trust-remote-code \
    --max-model-len 2048 \
    --gpu-memory-utilization 0.9 \
    --port "$PORT" &
  SERVER_PID=$!
  wait_for_server "$PORT"

  # Sanitize the model id into a filesystem-safe tag for the result file.
  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port "$PORT" --num-questions "${NUM_Q}" --save-results "${OUT}"

  # Fail the run (non-zero exit via assert) if accuracy is below threshold.
  python3 - <<PY
import json; acc=json.load(open('${OUT}'))['accuracy']
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
PY

  cleanup
  SERVER_PID=
  sleep 1
  # Use a fresh port per backend in case the old socket lingers in TIME_WAIT.
  PORT=$((PORT+1))
done
|
||||||
@ -1629,7 +1629,6 @@ steps:
|
|||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
agent_pool: mi325_4
|
agent_pool: mi325_4
|
||||||
# grade: Blocking
|
# grade: Blocking
|
||||||
gpu: h100
|
|
||||||
optional: true
|
optional: true
|
||||||
num_gpus: 4
|
num_gpus: 4
|
||||||
working_dir: "/vllm-workspace"
|
working_dir: "/vllm-workspace"
|
||||||
|
|||||||
@ -322,9 +322,6 @@ async def transfer_layer(
|
|||||||
num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
|
num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
|
||||||
assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
|
assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
|
||||||
assert num_physical_experts == ep_size * num_local_physical_experts
|
assert num_physical_experts == ep_size * num_local_physical_experts
|
||||||
# A buffer to hold the expert weights in one layer during the exchange.
|
|
||||||
# NOTE: Currently we assume the same weights across different layers
|
|
||||||
# have the same shape.
|
|
||||||
|
|
||||||
is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
|
is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
|
||||||
num_local_experts=num_local_physical_experts,
|
num_local_experts=num_local_physical_experts,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user