-
Notifications
You must be signed in to change notification settings - Fork 7
/
prepare.sh
executable file
·120 lines (100 loc) · 2.43 KB
/
prepare.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/cs/local/bin/bash
# Author : Mingbin Xu (mingbin.xu@gmail.com)
# Filename : prepare.sh
# Last Update : Feb 2, 2016
# Description : The entry point of LM training
# Website : https://wiki.eecs.yorku.ca/lab/MLL/
# Copyright (c) 2016 iNCML (author: Mingbin Xu)
# License: MIT License (see LICENSE)
# ANSI colour sequences, stored as literal backslash text; they become real
# escape bytes only when printf processes them.
Color_Off='\033[0m'   # reset all text attributes
Red='\033[0;31m'      # regular red foreground
# Validate the command line: exactly three positional arguments are required.
# On misuse, print a red usage message and exit with status 1.
# The colour escapes are written to stderr together with the text, so the
# message stays coloured and stdout stays clean when the streams are split.
if [ $# -ne 3 ]; then
  {
    printf '%b' "${Red}"
    printf "usage: %s <basename> <min-count> <max-size>\n" "$0"
    printf " <basename> : basename of the data set, e.g. ptb \n"
    printf " <min-count> : words whose occurrence is less than min-count are mapped to <unk>\n"
    printf " <max-size> : the vocabulary size is limited to max-size\n"
    printf '%b' "${Color_Off}"
  } 1>&2
  exit 1
fi

# Bind the positional arguments to the names used by the rest of the script.
basename=${1}   # data-set prefix used to build file names, e.g. "ptb"
min_count=${2}  # forwarded to the vocabulary tool as <min-count>
max_size=${3}   # forwarded to the vocabulary tool as <max-size>
# Verify that the working directory has the expected sub-directories; stop
# with a red hint at the first one that is missing.
for required_dir in config numeric-data raw-data source model-archive; do
  [ -d "${required_dir}" ] && continue
  printf "${Red}"
  printf "Please organize the files according to README. \n"
  printf "${Color_Off}"
  exit 1
done
# Build the CUDA trainer binary unless a ./trainer file already exists.
# The compile runs in a subshell so the script's own working directory never
# changes, and a failed `cd` is reported as a build failure instead of
# compiling in the wrong directory.
if [ ! -f "trainer" ]; then
  if ! (
    cd ./source &&
      nvcc -O3 -arch=sm_30 -lcurand -lcublas -Xcompiler -rdynamic \
        -o ../trainer \
        trainer.cpp network.cpp layer.cpp batch-constructor.cpp matrix.cu
  ); then
    printf '%b' "${Red}"
    printf "Fail to compile trainer. \n"
    printf '%b' "${Color_Off}"
    exit 1
  fi
  # Message previously read "traier is compiled" (typo).
  echo "trainer is compiled"
fi
# Compile the vocabulary tool from ./source; the step is skipped when a
# ./vocabulary file is already present.
[ -f "vocabulary" ] || {
  cd ./source
  if ! g++ -o ../vocabulary vocabulary.cpp -O3 -rdynamic; then
    printf "${Red}"
    printf "Fail to compile vocabulary. \n"
    printf "${Color_Off}"
    exit 1
  fi
  cd ..
  echo "vocabulary is compiled"
}
# Build the numericize tool unless a ./numericize file already exists.
# The compile runs in a subshell so the script's working directory is left
# untouched and a failed `cd` counts as a build failure.
if [ ! -f "numericize" ]; then
  if ! (
    cd ./source &&
      g++ -o ../numericize numericize.cpp -O3 -rdynamic
  ); then
    printf '%b' "${Red}"
    # Previously reported "Fail to compile trainer." (copy-paste error).
    printf "Fail to compile numericize. \n"
    printf '%b' "${Color_Off}"
    exit 1
  fi
  echo "numericize is compiled"
fi
# Regenerate <basename>.vocab from raw-data/<basename>.train.txt.
# The tool is run as ./vocabulary: the script builds it into the current
# directory, and a bare command name would only resolve if "." were in PATH.
rm -rf -- "${basename}.vocab"
if ! ./vocabulary "raw-data/${basename}.train.txt" "${min_count}" "${max_size}" \
  > "${basename}.vocab"; then
  printf '%b' "${Red}"
  printf "Fail to collect vocabulary statistics. \n"
  printf '%b' "${Color_Off}"
  exit 1
fi
echo "vocabulary statistics is collected"
# Convert every entry of raw-data/ into numeric-data/<name>.numeric, where
# <name> is the entry's file name with a trailing ".txt" removed.
# A glob replaces the parsed `ls` output so names containing whitespace or
# glob characters reach the tool intact, and the tool is run as ./numericize
# because a bare command name would only resolve if "." were in PATH.
for raw_file in raw-data/*; do
  # With an empty directory the pattern stays unexpanded; skip that literal.
  [ -e "${raw_file}" ] || [ -L "${raw_file}" ] || continue
  data=${raw_file##*/}
  if ! ./numericize "${basename}.vocab" "${raw_file}" \
    "numeric-data/${data%.txt}.numeric"; then
    printf '%b' "${Red}"
    printf "Fail to numericize %s. \n" "${data}"
    printf '%b' "${Color_Off}"
    exit 1
  fi
done
echo "data set has been numericized"
echo "preparation done:)"