forked from pytorch/builder
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild_cron.sh
executable file
·179 lines (163 loc) · 5.76 KB
/
build_cron.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/bin/bash
# Cron entry point for the nightly builds: starts this worker's share of the
# build tasks in parallel, waits for them, then emails about failures, uploads
# and cleans up.
# -e: stop at the first failing command; -x: trace every command into the log
set -ex
echo "build_cron.sh at $(pwd) starting at $(date) on $(uname -a) with pid $$"
# Absolute directory that holds this script. Every expansion is quoted so that
# a checkout path containing spaces or glob characters still resolves.
SOURCE_DIR="$(cd "$(dirname "$0")" && pwd)"
# NOTE(review): nightly_defaults.sh is not visible here; presumably it defines
# the NIGHTLIES_* and *_LOG_DIR variables used further down -- confirm.
source "${SOURCE_DIR}/nightly_defaults.sh"
# Script hardcoded to the number of worker machines we have.
# Divides work amongst the workers and runs the jobs in parallel on each worker
#
# Command line arguments
# which_worker
# Which linux worker this machine is: 0, 1 or 2. Not needed on mac, where
# there is only one machine.
#
# The task strings defined further down (one per build_multiple.sh call) list
# Python and CUDA versions in these formats:
# DESIRED_PYTHONS
# All Python versions to build for, separated by commas, in format '2.7mu'
# for manywheels or in format '2.7' for conda/mac-wheels e.g.
# '2.7m,2.7mu,3.5m,3.6m' or '2.7,3.7' . This can also just be the word
# 'all', which will expand to all supported python versions.
#
# DESIRED_CUDAS
# All CUDA versions to build for including 'cpu', separated by commas, in
# format 'cpu' or 'cu92' or 'cu100' etc. e.g. 'cpu,cu92' or 'cu92,cu100'
# . This can also just be the word 'all', which will expand to all
# supported cpu/CUDA versions.
# Work out which worker this machine is. On mac there is only one machine, so
# not specifying which machine is fine; everywhere else the worker number is
# the single required argument.
if [[ "$(uname)" != 'Darwin' ]]; then
  if (( $# != 1 )); then
    echo "Illegal number of parameters. Require which worker I am [0-2] or 'mac'"
    echo "e.g. ./build_cron.sh 0"
    exit 1
  fi
  which_worker=$1
  # This file is hardcoded to exactly 3 linux workers and 1 mac worker, so any
  # value other than 0, 1 or 2 is rejected
  case "$which_worker" in
    0|1|2)
      ;;
    *)
      echo "Illegal parameter. This script is made for exactly 3 workers."
      echo "You must give me a worker number out of [0, 1, 2] or 'mac'"
      exit 1
      ;;
  esac
else
  which_worker='mac'
fi
# Start from empty log directories. When jobs are re-run (after patching a
# change) logs left over from the earlier run would only cause confusion.
for stale_dir in "$FAILED_LOG_DIR" "$SUCCEEDED_LOG_DIR"; do
  # A failed removal is tolerated on purpose; the mkdir below still runs
  rm -rf "$stale_dir" || true
done
for fresh_dir in "$FAILED_LOG_DIR" "$SUCCEEDED_LOG_DIR"; do
  mkdir -p "$fresh_dir"
done
# Directory that receives the output of the steps started by this script
log_root="${NIGHTLIES_FOLDER}/logs/master"
mkdir -p "$log_root"
# Divvy up the tasks
#
# The task lists below currently cover 39 jobs: 10 on each of the 3 linux
# machines and 9 on the mac.
# Each linux machine runs its jobs in 5 parallel batches (the mac in 4);
# conda jobs and gpu jobs take longer.
#
# The jobs are the combinations of all:
# manywheel X [2.7m 2.7mu 3.5m 3.6m 3.7m] X [cpu cu92 cu100] (linux)
# conda X [2.7 3.5 3.6 3.7 ] X [cpu cu92 cu100] (linux)
# conda X [2.7 3.5 3.6 3.7 ] X [cpu ] (mac)
# wheel X [2.7 3.5 3.6 3.7 ] X [cpu ] (mac)
# libtorch X [2.7m ] X [cpu cu92 cu100] (linux)
# libtorch X [2.7 ] X [cpu ] (mac)
#
# cpu builds ~ 15 minutes. gpu builds > 1 hr
# Try to divide the cpu jobs evenly among the tasks
# Pick the task list for this worker. Every entry becomes the argument string
# of one build_multiple.sh run: package type, python versions, cpu/CUDA
# versions. ' -- ' appears to chain a second such group onto the same run.
case "$which_worker" in
  0)
    # manywheel 2.7m,2.7mu,3.5m on every cpu/CUDA version, libtorch cpu
    tasks=(
      'manywheel 2.7m cpu,cu92'
      'manywheel 2.7mu cpu,cu92'
      'manywheel 3.5m cpu,cu92'
      'manywheel 2.7m,2.7mu,3.5m cu100'
      'libtorch 2.7m cpu'
    )
    ;;
  1)
    # manywheel 3.6m,3.7m and conda 2.7 on every cpu/CUDA version,
    # libtorch cu92
    tasks=(
      'manywheel 3.6m cpu,cu92'
      'manywheel 3.7m cpu,cu92'
      'conda 2.7 cpu,cu92'
      'manywheel 3.6m,3.7m cu100 -- conda 2.7 cu100'
      'libtorch 2.7m cu92'
    )
    ;;
  2)
    # conda 3.5,3.6,3.7 on every cpu/CUDA version, libtorch cu100
    tasks=(
      'conda 3.5 cpu,cu92'
      'conda 3.6 cpu,cu92'
      'conda 3.7 cpu,cu92'
      'conda 3.5,3.6,3.7 cu100'
      'libtorch 2.7m cu100'
    )
    ;;
  mac)
    # every wheel, every conda (cpu only) and libtorch cpu
    tasks=(
      'wheel 2.7,3.5,3.6 cpu'
      'wheel 3.7 cpu -- conda 2.7 cpu'
      'conda 3.5,3.6,3.7 cpu'
      'libtorch 2.7 cpu'
    )
    ;;
esac
# Launch every task as a background build_multiple.sh run and keep its pid
child_pids=()
builder="${NIGHTLIES_BUILDER_ROOT}/cron/build_multiple.sh"
for job in "${tasks[@]}"; do
  # Log name: the task with spaces turned into '_' and each ',' and '-' removed
  job_log="${log_root}/$(echo $job | tr ' ' '_' | tr -d ',-').log"
  # $job stays unquoted so that it splits into separate arguments
  "$builder" $job > "$job_log" 2>&1 &
  job_pid="$!"
  echo "Starting [build_multiple.sh $job] at $(date) with pid $job_pid"
  child_pids+=("$job_pid")
done
# From here on a failing command must not abort the script: we would like to
# always upload and delete old build folders
set +e
# Exit status the script will finish with; 0 until a later step records one
first_ret=0
# Block until every background job has finished
echo "Waiting for all jobs to finish at $(date)"
for (( idx = 0; idx < ${#child_pids[@]}; idx++ )); do
  # The job's own exit status is not used here
  wait "${child_pids[idx]}"
done
echo "All jobs finished! at $(date)"
# Count the total number of failures: one entry per name found in
# $FAILED_LOG_DIR. The listing is word-split into the array, so the names must
# not contain whitespace.
failed_jobs=($(ls "$FAILED_LOG_DIR"))
echo "Detected ${#failed_jobs[@]} failed builds"
# Email everyone if the jobs failed
if [[ "${#failed_jobs[@]}" != 0 ]]; then
  echo "Emailing all of $NIGHTLIES_EMAIL_LIST"
  # The platform test must run uname; comparing the literal text "(uname)"
  # could never equal 'Darwin', so mac always received the -t as well.
  if [[ "$(uname)" == 'Darwin' ]]; then
    # `mail` on mac doesn't expect the -t
    dash_t=''
  else
    dash_t='-t'
  fi
  # $dash_t is left unquoted so that an empty value adds no argument at all
  mail -s "$NIGHTLIES_DATE nightlies failed" $dash_t "$NIGHTLIES_EMAIL_LIST" <<< \
"On $(uname -a)
On $(date)
Nightly jobs failed. Failed jobs are: ${failed_jobs[*]}"
  ret="$?"
  # Keep the status of the mail command unless a status was already recorded
  if [[ "$first_ret" == 0 ]]; then
    first_ret="$ret"
  fi
fi
# Upload the working binaries and all of the logs
# Only upload automatically on the current day, not on manual re-runs of past
# days: NIGHTLIES_DATE has to equal today's date.
if [[ "$NIGHTLIES_DATE" == "$(date +%Y_%m_%d)" ]]; then
  # One entry per name found in $SUCCEEDED_LOG_DIR. The listing is word-split
  # into the array, so the names must not contain whitespace.
  succeeded_jobs=($(ls "$SUCCEEDED_LOG_DIR"))
  # printf rather than echo: plain echo printed the '\n' literally
  printf 'Uploading all of these successful jobs:\n%s\n' "${succeeded_jobs[*]}"
  # Each job name is handed to upload.sh as its own argument
  "${NIGHTLIES_BUILDER_ROOT}/cron/upload.sh" "${succeeded_jobs[@]}" > "${log_root}/upload.log" 2>&1
  ret="$?"
  # Record the upload failure unless an earlier status was already recorded
  if [[ "$ret" != 0 && "$first_ret" == 0 ]]; then
    echo "FAILED upload.sh"
    first_ret="$ret"
  fi
fi
# Regardless of failures, clean up the old build folders so that we don't run
# out of space (the earlier wording said "memory"; presumably disk is meant).
# Only run the clean when the nightlies folder is the one named after
# NIGHTLIES_DATE, not on manual re-runs of past days. An empty NIGHTLIES_DATE
# never qualifies, so an unset configuration cannot trigger a clean.
if [[ -n "$NIGHTLIES_DATE" && "$(basename "$NIGHTLIES_FOLDER")" == "$NIGHTLIES_DATE" ]]; then
  # The output goes to clean.log, matching upload.log and the per-task logs
  "${NIGHTLIES_BUILDER_ROOT}/cron/clean.sh" > "${log_root}/clean.log" 2>&1
  ret="$?"
  # Record the clean failure unless an earlier status was already recorded
  if [[ "$ret" != 0 && "$first_ret" == 0 ]]; then
    echo "FAILED clean.sh"
    first_ret="$ret"
  fi
fi
# Finish with the first non-zero status recorded in first_ret by the email,
# upload or clean step, or 0 when none was recorded
exit "$first_ret"