Enabling cooperative multi-gpu tests on multi-gpu nodes (#27986)

Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
This commit is contained in:
Alexei-V-Ivanov-AMD 2025-11-05 09:35:49 -06:00 committed by GitHub
parent e50c454672
commit 80c9275348
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -173,6 +173,14 @@ fi
# Defaults for the test launch. PARALLEL_JOB_COUNT is the initial job count
# (the sharded branch below notes that it assigns the job count from the
# number of shards). MYPYTHONPATH is presumably consumed later in the script
# -- its use is not visible in this section.
PARALLEL_JOB_COUNT=8
MYPYTHONPATH='..'

# Look up the numeric GID of the host's 'render' group (third colon-separated
# field of the group database entry). The value is handed to --group-add in
# the launch commands further down; without it we cannot proceed, so bail out
# early with a diagnostic on stderr.
render_gid="$(getent group render | cut -d ':' -f 3)"
[[ -n "${render_gid}" ]] || {
  echo "Error: 'render' group not found. This is required for GPU access." >&2
  exit 1
}
# Check whether the command contains a shard flag; if so, run all shards in parallel because the host has 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
# assign job count as the number of shards used
@ -186,6 +194,7 @@ if [[ $commands == *"--shard-id="* ]]; then
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e HIP_VISIBLE_DEVICES="${GPU}" \
-e HF_TOKEN \
@ -217,8 +226,8 @@ else
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
--network=host \
--shm-size=16gb \
--group-add "$render_gid" \
--rm \
-e HIP_VISIBLE_DEVICES=0 \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \