Skip to content

Commit

Permalink
Add support for multi-node on CI (#5955)
Browse files Browse the repository at this point in the history
Signed-off-by: kevin <kevin@anyscale.com>
  • Loading branch information
khluu authored Jul 9, 2024
1 parent 08c5bde commit a0550cb
Showing 1 changed file with 77 additions and 0 deletions.
77 changes: 77 additions & 0 deletions .buildkite/run-multi-node-test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/bin/bash

set -euox pipefail

if [[ $# -lt 3 ]]; then
echo "Please provide the number of nodes and GPU per node."
exit 1
fi

NUM_NODES=$1
NUM_GPUS=$2
DOCKER_IMAGE=$3

shift 3
COMMANDS=("$@")
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
echo "The number of commands must be equal to the number of nodes."
echo "Number of nodes: $NUM_NODES"
echo "Number of commands: ${#COMMANDS[@]}"
exit 1
fi

echo "List of commands"
for command in "${COMMANDS[@]}"; do
echo $command
done

start_network() {
docker network create --subnet=192.168.10.0/24 docker-net
}

start_nodes() {
for node in $(seq 0 $(($NUM_NODES-1))); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
# echo "Starting node$node with GPU devices: $GPU_DEVICES"
docker run -d --gpus "$GPU_DEVICES" --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE tail -f /dev/null
done
}

run_nodes() {
for node in $(seq 0 $(($NUM_NODES-1))); do
GPU_DEVICES='"device='
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM))
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=','
fi
done
GPU_DEVICES+='"'
echo "Running node$node with GPU devices: $GPU_DEVICES"
if [ $node -lt $(($NUM_NODES - 1)) ]; then
docker exec -d node$node /bin/bash -c "${COMMANDS[$node]}"
else
docker exec node$node /bin/bash -c "${COMMANDS[$node]}"
fi
done
}
cleanup() {
for node in $(seq 0 $(($NUM_NODES-1))); do
docker stop node$node
done
docker network rm docker-net
}
trap cleanup EXIT
start_network
start_nodes
run_nodes

0 comments on commit a0550cb

Please sign in to comment.