forked from solana-labs/solana
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgce.sh
executable file
·408 lines (333 loc) · 10.5 KB
/
gce.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
#!/bin/bash -e
here=$(dirname "$0")
# shellcheck source=net/common.sh
source "$here"/common.sh
cloudProvider=$(basename "$0" .sh)
bootDiskType=""
case $cloudProvider in
gce)
# shellcheck source=net/scripts/gce-provider.sh
source "$here"/scripts/gce-provider.sh
imageName="ubuntu-16-04-cuda-9-2-new"
cpuLeaderMachineType=n1-standard-16
gpuLeaderMachineType="$cpuLeaderMachineType --accelerator count=4,type=nvidia-tesla-k80"
leaderMachineType=$cpuLeaderMachineType
validatorMachineType=n1-standard-4
clientMachineType=n1-standard-16
;;
ec2)
# shellcheck source=net/scripts/ec2-provider.sh
source "$here"/scripts/ec2-provider.sh
imageName="ami-0466e26ccc0e752c1"
cpuLeaderMachineType=m4.4xlarge
gpuLeaderMachineType=p2.xlarge
leaderMachineType=$cpuLeaderMachineType
validatorMachineType=m4.xlarge
clientMachineType=m4.4xlarge
;;
*)
echo "Error: Unknown cloud provider: $cloudProvider"
;;
esac
prefix=testnet-dev-${USER//[^A-Za-z0-9]/}
validatorNodeCount=5
clientNodeCount=1
leaderBootDiskSizeInGb=1000
validatorBootDiskSizeInGb=$leaderBootDiskSizeInGb
clientBootDiskSizeInGb=75
publicNetwork=false
enableGpu=false
leaderAddress=
usage() {
exitcode=0
if [[ -n "$1" ]]; then
exitcode=1
echo "Error: $*"
fi
cat <<EOF
usage: $0 [create|config|delete] [common options] [command-specific options]
Manage testnet instances
create - create a new testnet (implies 'config')
config - configure the testnet and write a config file describing it
delete - delete the testnet
common options:
-p [prefix] - Optional common prefix for instance names to avoid
collisions (default: $prefix)
create-specific options:
-n [number] - Number of validator nodes (default: $validatorNodeCount)
-c [number] - Number of client nodes (default: $clientNodeCount)
-P - Use public network IP addresses (default: $publicNetwork)
-z [zone] - Zone for the nodes (default: $zone)
-g - Enable GPU (default: $enableGpu)
-a [address] - Set the leader node's external IP address to this value.
For GCE, [address] is the "name" of the desired External
IP Address.
For EC2, [address] is the "allocation ID" of the desired
Elastic IP.
-d [disk-type] - Specify a boot disk type (default None) Use pd-ssd to get ssd on GCE.
config-specific options:
none
delete-specific options:
none
EOF
exit $exitcode
}
command=$1
[[ -n $command ]] || usage
shift
[[ $command = create || $command = config || $command = delete ]] || usage "Invalid command: $command"
while getopts "h?p:Pn:c:z:ga:d:" opt; do
case $opt in
h | \?)
usage
;;
p)
[[ ${OPTARG//[^A-Za-z0-9-]/} == "$OPTARG" ]] || usage "Invalid prefix: \"$OPTARG\", alphanumeric only"
prefix=$OPTARG
;;
P)
publicNetwork=true
;;
n)
validatorNodeCount=$OPTARG
;;
c)
clientNodeCount=$OPTARG
;;
z)
cloud_SetZone "$OPTARG"
;;
g)
enableGpu=true
leaderMachineType="$gpuLeaderMachineType"
;;
a)
leaderAddress=$OPTARG
;;
d)
bootDiskType=$OPTARG
;;
*)
usage "Error: unhandled option: $opt"
;;
esac
done
shift $((OPTIND - 1))
[[ -z $1 ]] || usage "Unexpected argument: $1"
sshPrivateKey="$netConfigDir/id_$prefix"
# cloud_ForEachInstance [cmd] [extra args to cmd]
#
# Execute a command for each element in the `instances` array
#
# cmd - The command to execute on each instance
# The command will receive arguments followed by any
# additionl arguments supplied to cloud_ForEachInstance:
# name - name of the instance
# publicIp - The public IP address of this instance
# privateIp - The priate IP address of this instance
# count - Monotonically increasing count for each
# invocation of cmd, starting at 1
# ... - Extra args to cmd..
#
#
cloud_ForEachInstance() {
declare cmd="$1"
shift
[[ -n $cmd ]] || { echo cloud_ForEachInstance: cmd not specified; exit 1; }
declare count=1
for info in "${instances[@]}"; do
declare name publicIp privateIp
IFS=: read -r name publicIp privateIp < <(echo "$info")
eval "$cmd" "$name" "$publicIp" "$privateIp" "$count" "$@"
count=$((count + 1))
done
}
prepareInstancesAndWriteConfigFile() {
$metricsWriteDatapoint "testnet-deploy net-config-begin=1"
cat >> "$configFile" <<EOF
# autogenerated at $(date)
netBasename=$prefix
publicNetwork=$publicNetwork
sshPrivateKey=$sshPrivateKey
EOF
buildSshOptions
recordInstanceIp() {
declare name="$1"
declare publicIp="$2"
declare privateIp="$3"
declare arrayName="$5"
echo "$arrayName+=($publicIp) # $name" >> "$configFile"
if [[ $arrayName = "leaderIp" ]]; then
if $publicNetwork; then
echo "entrypointIp=$publicIp" >> "$configFile"
else
echo "entrypointIp=$privateIp" >> "$configFile"
fi
fi
}
waitForStartupComplete() {
declare name="$1"
declare publicIp="$2"
echo "Waiting for $name to finish booting..."
(
for i in $(seq 1 30); do
if (set -x; ssh "${sshOptions[@]}" "$publicIp" "test -f /.instance-startup-complete"); then
break
fi
sleep 2
echo "Retry $i..."
done
)
echo "$name has booted."
}
echo "Looking for leader instance..."
cloud_FindInstance "$prefix-leader"
[[ ${#instances[@]} -eq 1 ]] || {
echo "Unable to find leader"
exit 1
}
(
declare leaderName
declare leaderIp
IFS=: read -r leaderName leaderIp _ < <(echo "${instances[0]}")
# Try to ping the machine first.
timeout 60s bash -c "set -o pipefail; until ping -c 3 $leaderIp | tr - _; do echo .; done"
if [[ ! -r $sshPrivateKey ]]; then
echo "Fetching $sshPrivateKey from $leaderName"
# Try to scp in a couple times, sshd may not yet be up even though the
# machine can be pinged...
set -x -o pipefail
for i in $(seq 1 30); do
if cloud_FetchFile "$leaderName" "$leaderIp" /solana-id_ecdsa "$sshPrivateKey"; then
break
fi
sleep 1
echo "Retry $i..."
done
chmod 400 "$sshPrivateKey"
ls -l "$sshPrivateKey"
fi
)
echo "leaderIp=()" >> "$configFile"
cloud_ForEachInstance recordInstanceIp leaderIp
cloud_ForEachInstance waitForStartupComplete
echo "Looking for validator instances..."
cloud_FindInstances "$prefix-validator"
[[ ${#instances[@]} -gt 0 ]] || {
echo "Unable to find validators"
exit 1
}
echo "validatorIpList=()" >> "$configFile"
cloud_ForEachInstance recordInstanceIp validatorIpList
cloud_ForEachInstance waitForStartupComplete
echo "clientIpList=()" >> "$configFile"
echo "Looking for client instances..."
cloud_FindInstances "$prefix-client"
[[ ${#instances[@]} -eq 0 ]] || {
cloud_ForEachInstance recordInstanceIp clientIpList
cloud_ForEachInstance waitForStartupComplete
}
echo "Wrote $configFile"
$metricsWriteDatapoint "testnet-deploy net-config-complete=1"
}
delete() {
$metricsWriteDatapoint "testnet-deploy net-delete-begin=1"
# Delete the leader node first to prevent unusual metrics on the dashboard
# during shutdown.
# TODO: It would be better to fully cut-off metrics reporting before any
# instances are deleted.
for filter in "$prefix-leader" "$prefix-"; do
echo "Searching for instances: $filter"
cloud_FindInstances "$filter"
if [[ ${#instances[@]} -eq 0 ]]; then
echo "No instances found matching '$filter'"
else
cloud_DeleteInstances true
fi
done
rm -f "$configFile"
$metricsWriteDatapoint "testnet-deploy net-delete-complete=1"
}
case $command in
delete)
delete
;;
create)
[[ -n $validatorNodeCount ]] || usage "Need number of nodes"
if [[ $validatorNodeCount -le 0 ]]; then
usage "One or more validator nodes is required"
fi
delete
$metricsWriteDatapoint "testnet-deploy net-create-begin=1"
rm -rf "$sshPrivateKey"{,.pub}
# Note: using rsa because |aws ec2 import-key-pair| seems to fail for ecdsa
ssh-keygen -t rsa -N '' -f "$sshPrivateKey"
printNetworkInfo() {
cat <<EOF
========================================================================================
Network composition:
Leader = $leaderMachineType (GPU=$enableGpu)
Validators = $validatorNodeCount x $validatorMachineType
Client(s) = $clientNodeCount x $clientMachineType
========================================================================================
EOF
}
printNetworkInfo
declare startupScript="$netConfigDir"/instance-startup-script.sh
cat > "$startupScript" <<EOF
#!/bin/bash -ex
# autogenerated at $(date)
cat > /etc/motd <<EOM
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
This instance has not been fully configured.
See startup script log messages in /var/log/syslog for status:
$ sudo cat /var/log/syslog | egrep \\(startup-script\\|cloud-init\)
To block until setup is complete, run:
$ until [[ -f /.instance-startup-complete ]]; do sleep 1; done
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
EOM
# Place the generated private key at /solana-id_ecdsa so it's retrievable by anybody
# who is able to log into this machine
cat > /solana-id_ecdsa <<EOK
$(cat "$sshPrivateKey")
EOK
cat > /solana-id_ecdsa.pub <<EOK
$(cat "$sshPrivateKey.pub")
EOK
chmod 444 /solana-id_ecdsa
USER=\$(id -un)
$(
cd "$here"/scripts/
cat \
disable-background-upgrades.sh \
create-solana-user.sh \
add-solana-user-authorized_keys.sh \
install-earlyoom.sh \
install-libssl-compatability.sh \
install-rsync.sh \
)
cat > /etc/motd <<EOM
$(printNetworkInfo)
EOM
touch /.instance-startup-complete
EOF
cloud_CreateInstances "$prefix" "$prefix-leader" 1 \
"$imageName" "$leaderMachineType" "$leaderBootDiskSizeInGb" \
"$startupScript" "$leaderAddress" "$bootDiskType"
cloud_CreateInstances "$prefix" "$prefix-validator" "$validatorNodeCount" \
"$imageName" "$validatorMachineType" "$validatorBootDiskSizeInGb" \
"$startupScript" "" "$bootDiskType"
if [[ $clientNodeCount -gt 0 ]]; then
cloud_CreateInstances "$prefix" "$prefix-client" "$clientNodeCount" \
"$imageName" "$clientMachineType" "$clientBootDiskSizeInGb" \
"$startupScript" "" "$bootDiskType"
fi
$metricsWriteDatapoint "testnet-deploy net-create-complete=1"
prepareInstancesAndWriteConfigFile
;;
config)
prepareInstancesAndWriteConfigFile
;;
*)
usage "Unknown command: $command"
esac