Skip to content

Commit 9032eb0

Browse files
authored
Add files via upload
1 parent 240a4a1 commit 9032eb0

File tree

1 file changed

+69
-0
lines changed

1 file changed

+69
-0
lines changed

scripts/splitData.sh

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/bin/bash
2+
3+
# This script receives a data directory and randomly splits its subdirectories into two data directories - train_dir and test_dir.
4+
# Arguments:
5+
# /data_dir is the path to the data directory.
6+
# /split_percentage is the percentage (0-100) of the data, measured by size, to place in train_dir.
7+
8+
#######################################
# Print the disk usage of a directory, in 1M blocks.
# Arguments: $1 - path of the directory to measure
# Outputs:   first field of `du -s -m` for that path, on stdout
#######################################
dirSize() {
  # -m already selects the block size, so the former -h was redundant.
  # Quoting and `--` keep paths with spaces or a leading dash intact.
  du -s -m -- "$1" | cut -f1
}
12+
13+
# Read and validate the command-line arguments, then measure the data.
#   $1 - data directory whose subdirectories will be split
#   $2 - integer percentage (0-100) of the data to place in train_dir
# Every failure is reported on stderr and ends the script with status 1.
data_dir=${1:-}
split_percentage=${2:-}

if [[ -z "$data_dir" || -z "$split_percentage" ]]; then
  echo "Usage: $0 <data_dir> <split_percentage>" >&2
  exit 1
fi

if [[ ! -d "$data_dir" ]]; then
  echo "Error: $data_dir is not a directory!" >&2
  exit 1
fi

# Check the shape of the value before doing arithmetic on it: (( )) fails with
# a syntax error on input such as "0.8" or "abc". The 10# prefix stops values
# like "08" from being parsed as (invalid) octal.
if [[ ! "$split_percentage" =~ ^[0-9]+$ ]] || ((10#$split_percentage > 100)); then
  echo "Error: Invalid parameter: split percentage should be a value between 0 and 100" >&2
  exit 1
fi

# Only the subdirectories get moved, so the reference size is the total size
# minus the size of the files that sit directly in data_dir.
data_dir_size=$(dirSize "$data_dir")
top_level_size=$(du -s -m --separate-dirs -- "$data_dir" | cut -f1)
data_dir_size=$((${data_dir_size:-0} - ${top_level_size:-0}))
if ((data_dir_size <= 0)); then
  echo "Error: $data_dir is empty!" >&2
  exit 1
fi
27+
28+
# Destination directories live inside the data directory. Any trailing slash
# is stripped and exactly one separator is inserted, so "data" and "data/"
# both yield "data/train_dir" (previously "data" produced "datatrain_dir").
: "${data_dir:?data directory is not set}"
train_dir="${data_dir%/}/train_dir"
test_dir="${data_dir%/}/test_dir"
mkdir -p -- "$train_dir" "$test_dir"

# Target share of the data for train_dir, as a fraction with two decimals.
split_rate=$(echo "scale=2; $split_percentage / 100" | bc)
35+
36+
# Collect the immediate subdirectories of data_dir in random order.
# -mindepth 1 excludes data_dir itself, and the NUL-delimited pipeline keeps
# directory names that contain whitespace in one piece.
sub_dirs=()
while IFS= read -r -d '' sub_dir; do
  sub_dirs+=("$sub_dir")
done < <(find "$data_dir" -mindepth 1 -maxdepth 1 -type d -print0 | shuf -z)

# Fill train_dir until its share of the data reaches split_rate, then send
# every remaining subdirectory to test_dir. Progress is redrawn on one line.
target_dir=$train_dir
for sub_dir in "${sub_dirs[@]}"; do
  # The destination directories are themselves children of data_dir.
  if [[ "$sub_dir" == "$train_dir" || "$sub_dir" == "$test_dir" ]]; then
    continue
  fi
  mv -- "$sub_dir" "$target_dir"
  if [[ "$target_dir" == "$train_dir" ]]; then
    train_dir_size=$(dirSize "$train_dir")
    proportion=$(echo "scale=2; $train_dir_size / $data_dir_size" | bc)
    printf '\rtrain dir size: %sM ; split rate: %.2f' "$train_dir_size" "$proportion"
    # bc prints 1 when the comparison holds, which (( )) treats as true.
    if (($(echo "$proportion >= $split_rate" | bc -l))); then
      target_dir=$test_dir
      printf '\n'
    fi
  else
    test_dir_size=$(dirSize "$test_dir")
    printf '\rtest dir size: %sM' "$test_dir_size"
  fi
done
printf '\n'
62+
63+
# Final report. A size variable is still unset when no subdirectory was moved
# into that directory (e.g. test_dir with a 100% split), so measure it here
# instead of printing a bare "M".
if [[ -z "${train_dir_size:-}" ]]; then
  train_dir_size=$(dirSize "$train_dir")
fi
if [[ -z "${test_dir_size:-}" ]]; then
  test_dir_size=$(dirSize "$test_dir")
fi

echo "data dir: $data_dir"
echo "train dir: $train_dir"
echo "test dir: $test_dir"
echo "data dir size: ${data_dir_size}M"
echo "train dir size: ${train_dir_size}M"
echo "test dir size: ${test_dir_size}M"
# proportion is unset when nothing was moved into train_dir; report 0 then.
printf 'final split rate = %.2f\n' "${proportion:-0}"

0 commit comments

Comments
 (0)