From 80c9275348c1c0be21f03938c918d2ed5d98574b Mon Sep 17 00:00:00 2001 From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com> Date: Wed, 5 Nov 2025 09:35:49 -0600 Subject: [PATCH] Enabling cooperative multi-gpu tests on multi-gpu nodes (#27986) Signed-off-by: Alexei V. Ivanov --- .buildkite/scripts/hardware_ci/run-amd-test.sh | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index aa4cc7b35a543..58fd435691f4a 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -173,6 +173,14 @@ fi PARALLEL_JOB_COUNT=8 MYPYTHONPATH=".." +# Test that we're launching on the machine that has +# proper access to GPUs +render_gid=$(getent group render | cut -d: -f3) +if [[ -z "$render_gid" ]]; then + echo "Error: 'render' group not found. This is required for GPU access." >&2 + exit 1 +fi + # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. if [[ $commands == *"--shard-id="* ]]; then # assign job count as the number of shards used @@ -186,6 +194,7 @@ if [[ $commands == *"--shard-id="* ]]; then --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ --network=host \ --shm-size=16gb \ + --group-add "$render_gid" \ --rm \ -e HIP_VISIBLE_DEVICES="${GPU}" \ -e HF_TOKEN \ @@ -217,8 +226,8 @@ else --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \ --network=host \ --shm-size=16gb \ + --group-add "$render_gid" \ --rm \ - -e HIP_VISIBLE_DEVICES=0 \ -e HF_TOKEN \ -e AWS_ACCESS_KEY_ID \ -e AWS_SECRET_ACCESS_KEY \