mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 03:05:02 +08:00
Enabling cooperative multi-gpu tests on multi-gpu nodes (#27986)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
This commit is contained in:
parent
e50c454672
commit
80c9275348
@ -173,6 +173,14 @@ fi
|
||||
PARALLEL_JOB_COUNT=8
|
||||
MYPYTHONPATH=".."
|
||||
|
||||
# Test that we're launching on the machine that has
|
||||
# proper access to GPUs
|
||||
render_gid=$(getent group render | cut -d: -f3)
|
||||
if [[ -z "$render_gid" ]]; then
|
||||
echo "Error: 'render' group not found. This is required for GPU access." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
||||
if [[ $commands == *"--shard-id="* ]]; then
|
||||
# assign job count as the number of shards used
|
||||
@ -186,6 +194,7 @@ if [[ $commands == *"--shard-id="* ]]; then
|
||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||
--network=host \
|
||||
--shm-size=16gb \
|
||||
--group-add "$render_gid" \
|
||||
--rm \
|
||||
-e HIP_VISIBLE_DEVICES="${GPU}" \
|
||||
-e HF_TOKEN \
|
||||
@ -217,8 +226,8 @@ else
|
||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||
--network=host \
|
||||
--shm-size=16gb \
|
||||
--group-add "$render_gid" \
|
||||
--rm \
|
||||
-e HIP_VISIBLE_DEVICES=0 \
|
||||
-e HF_TOKEN \
|
||||
-e AWS_ACCESS_KEY_ID \
|
||||
-e AWS_SECRET_ACCESS_KEY \
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user