#!/bin/bash
# This script builds the GH200 docker image and runs offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
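# Exit on the first failing command (-e) and echo each command as it runs (-x).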
set -ex
# Skip the fresh torch installation during the build since we use the torch version specified for arm64 in the Dockerfile.
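# (use_existing_torch.py removes the torch pins from the requirements files so the preinstalled build is reused.)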
python3 use_existing_torch.py
# Try building the docker image
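# Build-arg notes: max_jobs and nvcc_threads control how parallel the extension build is,
# RUN_WHEEL_CHECK=false disables the wheel check in the Dockerfile, and
# torch_cuda_arch_list="9.0+PTX" targets the Hopper (compute capability 9.0) GPU in the GH200.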
DOCKER_BUILDKIT=1 docker build . \
  --file docker/Dockerfile \
  --target vllm-openai \
  --platform "linux/arm64" \
  -t gh200-test \
  --build-arg max_jobs=66 \
  --build-arg nvcc_threads=2 \
  --build-arg RUN_WHEEL_CHECK=false \
  --build-arg torch_cuda_arch_list="9.0+PTX"
# Set up cleanup
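# Remove the container on exit, and also up front in case a previous run left one behind.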
remove_docker_container() { docker rm -f gh200-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image and test offline inference
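# HF_TOKEN is forwarded so the gated meta-llama model can be downloaded, and the host's
# Hugging Face cache is mounted to avoid re-downloading weights across runs.
# VLLM_WORKER_MULTIPROC_METHOD=spawn selects the multiprocessing start method for the vLLM workers.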
docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
'