Signed-off-by: Robert Shaw <robshaw@redhat.com>
This commit is contained in:
Robert Shaw 2025-07-12 21:57:16 +00:00
parent 550f8a052c
commit 13729ad0af
2 changed files with 66 additions and 5 deletions

View File

@@ -1,12 +1,20 @@
# NOTE(review): this span is a unified-diff rendering whose +/- markers were
# lost during extraction, so removed (old) and added (new) lines are
# interleaved. It is NOT a valid Dockerfile as-is — do not build from it;
# recover the post-commit file from the repository instead.
ARG CUDA_VERSION=12.8.1
# NOTE(review): lowercase `from` vs uppercase `FROM` below — presumably the
# old line and its replacement from the diff; only one belongs in the file.
from nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
# NOTE(review): curl|sh-style installer for uv — appears superseded by the
# `COPY --from=ghcr.io/astral-sh/uv` line further down; confirm which side of
# the diff this is on. Also uses `apt` (hadolint DL3027 prefers `apt-get`).
RUN wget -qO- https://astral.sh/uv/install.sh | sh
RUN apt update && apt install git -y && apt install curl -y
WORKDIR /workspace
# NOTE(review): two clone steps for the same repo — the combined clone+install
# and the bare clone are presumably the old and new variants of one step.
RUN git clone https://github.com/vllm-project/vllm.git && \
VLLM_USE_PRECOMPILED=1 uv pip install -e .
RUN git clone https://github.com/vllm-project/vllm.git
# Copy the uv/uvx binaries from the official uv image (pin a version tag
# rather than :latest for reproducibility — TODO confirm).
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
# Install vllm.
WORKDIR /workspace/vllm
# NOTE(review): two conflicting VLLM_SHA values appear below — one per diff
# side. The second (550f8a05…) matches this commit's recorded parent; verify
# against the repository which value the final file keeps.
ENV VLLM_SHA=8ce3cad72fbd0dc6524e495ecddbbc58fd8fd09e
# Create a Python 3.12 virtualenv and install vllm editable, using the
# precompiled-wheel path to skip local CUDA kernel compilation.
RUN uv venv .vllm --python 3.12
RUN . .vllm/bin/activate && VLLM_USE_PRECOMPILED=1 uv pip install -e .
# Checkout a specific commit.
ENV VLLM_SHA=550f8a052cae03c7e14a46767f689ab09c1cc28d
RUN git fetch && git checkout ${VLLM_SHA}
# Interactive shell as the container entrypoint (exec form, as recommended).
ENTRYPOINT ["/bin/bash"]

View File

@@ -0,0 +1,53 @@
# Tune MoE kernels for Llama-4 Scout in default (bf16) precision: TP=1, EP=8.
llama-scout-bf16:
	python3 benchmark_moe.py --model meta-llama/Llama-4-Scout-17B-16E-Instruct --tp-size 1 --ep-size 8 --tune
# Tune MoE kernels for Llama-4 Scout quantized to fp8 (w8a8): TP=1, EP=8.
llama-scout-fp8:
	python3 benchmark_moe.py --model meta-llama/Llama-4-Scout-17B-16E-Instruct --tp-size 1 --ep-size 8 --dtype fp8_w8a8 --tune
# Tune MoE kernels for Llama-4 Maverick in fp8 (w8a8): TP=1, EP=8.
llama-maverick:
	python3 benchmark_moe.py --model meta-llama/Llama-4-Maverick-17B-128E-Instruct --tp-size 1 --ep-size 8 --dtype fp8_w8a8 --tune
# Tune MoE kernels for Qwen3-30B-A3B in default precision: TP=1, EP=8.
qwen-30b:
	python3 benchmark_moe.py --model Qwen/Qwen3-30B-A3B --tp-size 1 --ep-size 8 --tune
# Tune MoE kernels for the pre-quantized Qwen3-30B-A3B-FP8 checkpoint
# using the fp8 (w8a8) kernel path: TP=1, EP=8.
qwen-30b-fp8:
	python3 benchmark_moe.py --model Qwen/Qwen3-30B-A3B-FP8 --tp-size 1 --ep-size 8 --dtype fp8_w8a8 --tune
# Tune MoE kernels for Qwen3-235B-A22B in fp8 (w8a8): TP=1, EP=8.
qwen-235b:
	python3 benchmark_moe.py --model Qwen/Qwen3-235B-A22B --tp-size 1 --ep-size 8 --dtype fp8_w8a8 --tune
# Tune MoE kernels for DeepSeek-R1-0528 in fp8 (w8a8): TP=1, EP=8.
deepseek-r1:
	python3 benchmark_moe.py --model deepseek-ai/DeepSeek-R1-0528 --tp-size 1 --ep-size 8 --dtype fp8_w8a8 --tune