mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-25 10:16:32 +08:00
parent
25221b44bb
commit
b4054c8ab4
@ -1,73 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euxo pipefail
|
||||
|
||||
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
|
||||
THRESHOLD=${1:-0.25}
|
||||
NUM_Q=${2:-1319}
|
||||
PORT=${3:-8030}
|
||||
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
|
||||
mkdir -p "${OUT_DIR}"
|
||||
|
||||
wait_for_server() {
|
||||
local port=$1
|
||||
timeout 600 bash -c '
|
||||
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
|
||||
sleep 1
|
||||
done'
|
||||
}
|
||||
|
||||
MODEL="deepseek-ai/DeepSeek-V2-lite"
|
||||
|
||||
# Set BACKENDS based on platform
|
||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
||||
# ROCm platform
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
||||
kill "${SERVER_PID}" 2>/dev/null || true
|
||||
for _ in {1..20}; do
|
||||
kill -0 "${SERVER_PID}" 2>/dev/null || break
|
||||
sleep 0.5
|
||||
done
|
||||
kill -9 "${SERVER_PID}" 2>/dev/null || true
|
||||
fi
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
for BACK in "${BACKENDS[@]}"; do
|
||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
||||
vllm serve "$MODEL" \
|
||||
--enforce-eager \
|
||||
--tensor-parallel-size 2 \
|
||||
--data-parallel-size 2 \
|
||||
--enable-expert-parallel \
|
||||
--enable-eplb \
|
||||
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--port $PORT &
|
||||
SERVER_PID=$!
|
||||
wait_for_server $PORT
|
||||
|
||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||
OUT="${OUT_DIR}/${TAG}_${BACK}_async_eplb.json"
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
||||
python3 - <<PY
|
||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
||||
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
|
||||
PY
|
||||
|
||||
cleanup
|
||||
SERVER_PID=
|
||||
sleep 1
|
||||
PORT=$((PORT+1))
|
||||
done
|
||||
@ -50,7 +50,6 @@ for BACK in "${BACKENDS[@]}"; do
|
||||
--data-parallel-size 2 \
|
||||
--enable-expert-parallel \
|
||||
--enable-eplb \
|
||||
--eplb-config '{"window_size":200,"step_interval":600}' \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--port $PORT &
|
||||
|
||||
@ -1,74 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euxo pipefail
|
||||
|
||||
# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
|
||||
THRESHOLD=${1:-0.25}
|
||||
NUM_Q=${2:-1319}
|
||||
PORT=${3:-8040}
|
||||
OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
|
||||
mkdir -p "${OUT_DIR}"
|
||||
|
||||
wait_for_server() {
|
||||
local port=$1
|
||||
timeout 600 bash -c '
|
||||
until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
|
||||
sleep 1
|
||||
done'
|
||||
}
|
||||
|
||||
MODEL="Qwen/Qwen3-Next-80B-A3B-Instruct"
|
||||
|
||||
# Set BACKENDS based on platform
|
||||
if command -v rocm-smi &> /dev/null || [[ -d /opt/rocm ]] || [[ -n "${ROCM_PATH:-}" ]]; then
|
||||
# ROCm platform
|
||||
BACKENDS=("allgather_reducescatter")
|
||||
# Disable MOE padding for ROCm since it is causing eplb to fail
|
||||
export VLLM_ROCM_MOE_PADDING=0
|
||||
else
|
||||
# Non-ROCm platform (CUDA/other)
|
||||
BACKENDS=("deepep_high_throughput" "deepep_low_latency")
|
||||
fi
|
||||
|
||||
cleanup() {
|
||||
if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
||||
kill "${SERVER_PID}" 2>/dev/null || true
|
||||
for _ in {1..20}; do
|
||||
kill -0 "${SERVER_PID}" 2>/dev/null || break
|
||||
sleep 0.5
|
||||
done
|
||||
kill -9 "${SERVER_PID}" 2>/dev/null || true
|
||||
fi
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
for BACK in "${BACKENDS[@]}"; do
|
||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
||||
vllm serve "$MODEL" \
|
||||
--enforce-eager \
|
||||
--tensor-parallel-size 4 \
|
||||
--enable-expert-parallel \
|
||||
--enable-eplb \
|
||||
--eplb-config '{"window_size":200,"step_interval":600,"use_async":true}' \
|
||||
--speculative-config '{"method":"qwen3_next_mtp","num_speculative_tokens":1}' \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--port $PORT &
|
||||
SERVER_PID=$!
|
||||
wait_for_server $PORT
|
||||
|
||||
TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
|
||||
OUT="${OUT_DIR}/${TAG}_${BACK}.json"
|
||||
python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
|
||||
python3 - <<PY
|
||||
import json; acc=json.load(open('${OUT}'))['accuracy']
|
||||
print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
|
||||
assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
|
||||
PY
|
||||
|
||||
cleanup
|
||||
SERVER_PID=
|
||||
sleep 1
|
||||
PORT=$((PORT+1))
|
||||
done
|
||||
@ -1379,22 +1379,4 @@ steps:
|
||||
num_gpus: 2
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
|
||||
- label: DeepSeek V2-Lite Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_async_eplb.sh 0.25 1319 8030
|
||||
|
||||
- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy
|
||||
timeout_in_minutes: 60
|
||||
gpu: h100
|
||||
optional: true
|
||||
num_gpus: 4
|
||||
working_dir: "/vllm-workspace"
|
||||
commands:
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040
|
||||
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1
|
||||
@ -322,6 +322,9 @@ async def transfer_layer(
|
||||
num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
|
||||
assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
|
||||
assert num_physical_experts == ep_size * num_local_physical_experts
|
||||
# A buffer to hold the expert weights in one layer during the exchange.
|
||||
# NOTE: Currently we assume the same weights across different layers
|
||||
# have the same shape.
|
||||
|
||||
is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
|
||||
num_local_experts=num_local_physical_experts,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user