#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
# Copyright (c) 2024 Red Hat, Inc. All Rights Reserved.
#
# Run all tests in parallel
#
# This is a massive resource bomb of a script. For each test runner, it
# creates a pair of sparse loop devices for the test and scratch devices and
# mount points for them, then runs that runner's share of the tests in the
# background. When a runner completes, its loop devices are torn down.
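#
# Usage (inferred from the argument handling below):
#	check-parallel <basedir> [arguments passed through to ./check]
#
# <basedir> is the directory under which the per-runner work areas
# (runner-0 ... runner-63) and their results are created.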
export SRC_DIR="tests"
basedir=$1
shift
check_args="$*"
runners=64
runner_list=()
runtimes=()
# tests in auto group
test_list=$(awk '/^[0-9].*auto/ { print "generic/" $1 }' tests/generic/group.list)
test_list+=" $(awk '/^[0-9].*auto/ { print "xfs/" $1 }' tests/xfs/group.list)"
# grab all previously run tests and order them from highest runtime to lowest
# We are going to try to run the longer tests first, hopefully so we can avoid
# massive thundering herds trying to run lots of really short tests in parallel
# right off the bat. This will also tend to vary the order of tests from run to
# run somewhat.
#
# If we have tests in the test list that don't have runtimes recorded, then
# append them to be run last.
build_runner_list()
{
	local runtimes
	local run_list=()
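
	# Find the newest results directory left behind by runner 0, then
	# collect the per-test runtimes every runner recorded for that run
	# and sort them from longest to shortest.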
	local prev_results=`ls -tr $basedir/runner-0/ | grep results | tail -1`

	runtimes=$(cat $basedir/*/$prev_results/check.time | sort -k 2 -nr | cut -d " " -f 1)

	# Iterate the timed list first. For every timed list entry that
	# is found in the test_list, add it to the local runner list.
	local -a _list=( $runtimes )
	local -a _tlist=( $test_list )
	local rx=0
	local ix
	local jx

	#set -x
	for ((ix = 0; ix < ${#_list[*]}; ix++)); do
		echo $test_list | grep -q ${_list[$ix]}
		if [ $? == 0 ]; then
			# add the test to the new run list and remove
			# it from the remaining test list.
			run_list[rx++]=${_list[$ix]}
			_tlist=( ${_tlist[*]/${_list[$ix]}/} )
		fi
	done

	# The final test list is all the time ordered tests followed by
	# all the tests we didn't find time records for.
	test_list="${run_list[*]} ${_tlist[*]}"
}
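
# Only reorder the test list by runtime if a previous run has left timing
# data behind.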
if [ -f $basedir/runner-0/results/check.time ]; then
build_runner_list
fi
# split the list amongst N runners
split_runner_list()
{
	local ix
	local rx
	local -a _list=( $test_list )
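
	# Hand the tests out round-robin so each runner gets roughly the same
	# number of entries while preserving the runtime-sorted ordering
	# within its list.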
	for ((ix = 0; ix < ${#_list[*]}; ix++)); do
		seq="${_list[$ix]}"
		rx=$((ix % $runners))
		runner_list[$rx]+="${_list[$ix]} "
		#echo $seq
	done
}
_create_loop_device()
{
	local file=$1 dev

	dev=`losetup -f --show $file` || _fail "Cannot assign $file to a loop device"

	# Using buffered IO for the loop devices seems to run quite a bit
	# faster. There are a lot of tests that hit the same regions of the
	# filesystems, so avoiding read IO seems to really help. Results can
	# vary, though, because many tests drop all caches unconditionally.
	# Uncomment to use AIO+DIO loop devices instead.
	#test -b "$dev" && losetup --direct-io=on $dev 2> /dev/null

	echo $dev
}
_destroy_loop_device()
{
	local dev=$1
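
	# Flush any dirty buffers and make sure nothing is still mounted from
	# the device before detaching it.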
	blockdev --flushbufs $dev
	umount $dev > /dev/null 2>&1
	losetup -d $dev || _fail "Cannot destroy loop device $dev"
}
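
# Set up and run one test runner. $1 is the runner id, $2 is the timestamp
# shared by all runners for this invocation. Each runner gets its own sparse
# test and scratch images backed by loop devices, its own mount points and a
# results directory under $basedir/runner-<id>, and runs its share of the
# test list via ./check in a private mount namespace.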
runner_go()
{
	local id=$1
	local me=$basedir/runner-$id
	local _test=$me/test.img
	local _scratch=$me/scratch.img
	local _results=$me/results-$2

	mkdir -p $me

	xfs_io -f -c 'truncate 2g' $_test
	xfs_io -f -c 'truncate 8g' $_scratch
	mkfs.xfs -f $_test > /dev/null 2>&1

	export TEST_DEV=$(_create_loop_device $_test)
	export TEST_DIR=$me/test
	export SCRATCH_DEV=$(_create_loop_device $_scratch)
	export SCRATCH_MNT=$me/scratch
	export FSTYP=xfs
	export RESULT_BASE=$_results

	mkdir -p $TEST_DIR
	mkdir -p $SCRATCH_MNT
	mkdir -p $RESULT_BASE
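	# Clear out any check state left behind by a previous run in this
	# results directory.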
	rm -f $RESULT_BASE/check.*

	# export DUMP_CORRUPT_FS=1

	# Run the tests in their own mount namespace, as per the comment below
	# that precedes making the basedir a private mount.
	./src/nsexec -m ./check $check_args -x unreliable_in_parallel --exact-order ${runner_list[$id]} > $me/log 2>&1

	wait
	sleep 1

	umount -R $TEST_DIR 2> /dev/null
	umount -R $SCRATCH_MNT 2> /dev/null
	_destroy_loop_device $TEST_DEV
	_destroy_loop_device $SCRATCH_DEV

	grep -q Failures: $me/log
	if [ $? -eq 0 ]; then
		echo -n "Runner $id Failures: "
		grep Failures: $me/log | uniq | sed -e "s/^.*Failures://"
	fi
}
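
# Tear everything down on interrupt: stop any check instances that are still
# running, unmount the per-runner test and scratch mounts, and detach all
# loop devices on the system.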
cleanup()
{
	killall -INT -q check
	wait
	umount -R $basedir/*/test 2> /dev/null
	umount -R $basedir/*/scratch 2> /dev/null
	losetup --detach-all
}
trap "cleanup; exit" HUP INT QUIT TERM
# Each parallel test runner needs to see only its own mount points. If we
# leave the basedir as a shared mount, then all tests see all mounts and we
# get mount propagation issues cropping up. For example, cloning a new mount
# namespace will take a reference to all visible shared mounts and hold them
# while the mount namespace is active. This can cause the unmount in the test
# that controls the mount to succeed without actually unmounting the
# filesystem, because a mount namespace still holds a reference to it. That
# causes subsequent operations on the block device to fail because the device
# is still busy (e.g. fsck, mkfs, etc). Hence we make the basedir private
# here and then run each check instance in its own mount namespace so that
# the instances cannot see mounts that other tests are performing.
mount --make-private $basedir
split_runner_list
now=`date +%Y-%m-%d-%H:%M:%S`
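
# Start all the runners in the background with a common timestamp so their
# results directories share the same suffix, then wait for them all to
# complete.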
for ((i = 0; i < $runners; i++)); do
	runner_go $i $now &
done
wait
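
# Summarise the run from the per-runner logs: how many distinct tests ran,
# the total failure count, and the ten longest-running tests.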
echo -n "Tests run: "
grep Ran /mnt/xfs/*/log | sed -e 's,^.*:,,' -e 's, ,\n,g' | sort | uniq | wc -l
echo -n "Failure count: "
grep Failures: $basedir/*/log | uniq | sed -e "s/^.*Failures://" -e "s,\([0-9]\) \([gx]\),\1\n \2,g" |wc -l
echo
echo Ten slowest tests - runtime in seconds:
cat $basedir/*/results/check.time | sort -k 2 -nr | head -10
echo
echo "Cleanup on Aisle 5?"
echo
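# Show anything left behind that should have been cleaned up: loop devices,
# device-mapper nodes and mounted XFS filesystems.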
losetup --list
ls -l /dev/mapper
df -h | grep xfs