mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-15 08:35:48 +08:00
Enabling cooperative multi-gpu tests on multi-gpu nodes (#27986)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
This commit is contained in:
parent
e50c454672
commit
80c9275348
@ -173,6 +173,14 @@ fi
|
|||||||
PARALLEL_JOB_COUNT=8
|
PARALLEL_JOB_COUNT=8
|
||||||
MYPYTHONPATH=".."
|
MYPYTHONPATH=".."
|
||||||
|
|
||||||
|
# Verify that we are launching on a machine that has
|
||||||
|
# proper access to GPUs (the 'render' group must exist).
|
||||||
|
render_gid=$(getent group render | cut -d: -f3)
|
||||||
|
if [[ -z "$render_gid" ]]; then
|
||||||
|
echo "Error: 'render' group not found. This is required for GPU access." >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
# Check whether the command contains a shard flag; if so, run all shards in parallel because the host has 8 GPUs.
|
# Check whether the command contains a shard flag; if so, run all shards in parallel because the host has 8 GPUs.
|
||||||
if [[ $commands == *"--shard-id="* ]]; then
|
if [[ $commands == *"--shard-id="* ]]; then
|
||||||
# assign job count as the number of shards used
|
# assign job count as the number of shards used
|
||||||
@ -186,6 +194,7 @@ if [[ $commands == *"--shard-id="* ]]; then
|
|||||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||||
--network=host \
|
--network=host \
|
||||||
--shm-size=16gb \
|
--shm-size=16gb \
|
||||||
|
--group-add "$render_gid" \
|
||||||
--rm \
|
--rm \
|
||||||
-e HIP_VISIBLE_DEVICES="${GPU}" \
|
-e HIP_VISIBLE_DEVICES="${GPU}" \
|
||||||
-e HF_TOKEN \
|
-e HF_TOKEN \
|
||||||
@ -217,8 +226,8 @@ else
|
|||||||
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
--device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
|
||||||
--network=host \
|
--network=host \
|
||||||
--shm-size=16gb \
|
--shm-size=16gb \
|
||||||
|
--group-add "$render_gid" \
|
||||||
--rm \
|
--rm \
|
||||||
-e HIP_VISIBLE_DEVICES=0 \
|
|
||||||
-e HF_TOKEN \
|
-e HF_TOKEN \
|
||||||
-e AWS_ACCESS_KEY_ID \
|
-e AWS_ACCESS_KEY_ID \
|
||||||
-e AWS_SECRET_ACCESS_KEY \
|
-e AWS_SECRET_ACCESS_KEY \
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user