#!/usr/bin/env bash
# Author : Thamme Gowda
# Created : Nov 06, 2017
ONMT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/.." && pwd )"
#======= EXPERIMENT SETUP ======
# Activate python environment if needed
source ~/.bashrc
# source activate py3
# update these variables
NAME="run1"
OUT="onmt-runs/$NAME"
DATA="$ONMT/onmt-runs/data"
TRAIN_SRC=$DATA/*train.src
TRAIN_TGT=$DATA/*train.tgt
VALID_SRC=$DATA/*dev.src
VALID_TGT=$DATA/*dev.tgt
TEST_SRC=$DATA/*test.src
TEST_TGT=$DATA/*test.tgt
BPE="" # default
BPE="src" # src, tgt, src+tgt
# applicable only when BPE="src" or "src+tgt"
BPE_SRC_OPS=10000
# applicable only when BPE="tgt" or "src+tgt"
BPE_TGT_OPS=10000
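# GPU id to use; it is passed to train.py as -gpu_ranks and to translate.py as -gpu below.
# Leave GPUARG empty ("") to run on CPU.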
GPUARG="" # default
GPUARG="0"
#====== EXPERIMENT BEGIN ======
# Check if input exists
for f in $TRAIN_SRC $TRAIN_TGT $VALID_SRC $VALID_TGT $TEST_SRC $TEST_TGT; do
    if [[ ! -f "$f" ]]; then
        echo "Input file $f doesn't exist. Please fix the paths"
        exit 1
    fi
done
function lines_check {
    # Compare only the line counts; "wc -l FILE" would also print the file name
    # and make the comparison fail even when the counts match.
    l1=`wc -l < $1`
    l2=`wc -l < $2`
    if [[ $l1 != $l2 ]]; then
        echo "ERROR: Record counts don't match between: $1 and $2"
        exit 2
    fi
}
lines_check $TRAIN_SRC $TRAIN_TGT
lines_check $VALID_SRC $VALID_TGT
lines_check $TEST_SRC $TEST_TGT
echo "Output dir = $OUT"
[ -d $OUT ] || mkdir -p $OUT
[ -d $OUT/data ] || mkdir -p $OUT/data
[ -d $OUT/models ] || mkdir -p $OUT/models
[ -d $OUT/test ] || mkdir -p $OUT/test
echo "Step 1a: Preprocess inputs"
if [[ "$BPE" == *"src"* ]]; then
echo "BPE on source"
# Here we could use more monolingual data
$ONMT/tools/learn_bpe.py -s $BPE_SRC_OPS < $TRAIN_SRC > $OUT/data/bpe-codes.src
$ONMT/tools/apply_bpe.py -c $OUT/data/bpe-codes.src < $TRAIN_SRC > $OUT/data/train.src
$ONMT/tools/apply_bpe.py -c $OUT/data/bpe-codes.src < $VALID_SRC > $OUT/data/valid.src
$ONMT/tools/apply_bpe.py -c $OUT/data/bpe-codes.src < $TEST_SRC > $OUT/data/test.src
else
ln -sf $TRAIN_SRC $OUT/data/train.src
ln -sf $VALID_SRC $OUT/data/valid.src
ln -sf $TEST_SRC $OUT/data/test.src
fi
if [[ "$BPE" == *"tgt"* ]]; then
echo "BPE on target"
# Here we could use more monolingual data
$ONMT/tools/learn_bpe.py -s $BPE_SRC_OPS < $TRAIN_TGT > $OUT/data/bpe-codes.tgt
$ONMT/tools/apply_bpe.py -c $OUT/data/bpe-codes.tgt < $TRAIN_TGT > $OUT/data/train.tgt
$ONMT/tools/apply_bpe.py -c $OUT/data/bpe-codes.tgt < $VALID_TGT > $OUT/data/valid.tgt
#$ONMT/tools/apply_bpe.py -c $OUT/data/bpe-codes.tgt < $TEST_TGT > $OUT/data/test.tgt
# We dont touch the test References, No BPE on them!
ln -sf $TEST_TGT $OUT/data/test.tgt
else
ln -sf $TRAIN_TGT $OUT/data/train.tgt
ln -sf $VALID_TGT $OUT/data/valid.tgt
ln -sf $TEST_TGT $OUT/data/test.tgt
fi
#: <<EOF
echo "Step 1b: Preprocess"
python $ONMT/preprocess.py \
    -train_src $OUT/data/train.src \
    -train_tgt $OUT/data/train.tgt \
    -valid_src $OUT/data/valid.src \
    -valid_tgt $OUT/data/valid.tgt \
    -save_data $OUT/data/processed
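# preprocess.py builds the vocabularies and writes the binarized train/valid data
# under the $OUT/data/processed prefix; train.py loads them below via -data.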
echo "Step 2: Train"
GPU_OPTS=""
if [[ ! -z $GPUARG ]]; then
    GPU_OPTS="-gpu_ranks $GPUARG"
fi
CMD="python $ONMT/train.py -data $OUT/data/processed -save_model $OUT/models/$NAME $GPU_OPTS"
echo "Training command :: $CMD"
eval "$CMD"
#EOF
# select a model with high accuracy and low perplexity
# TODO: currently using a linear scale, which may not be the best
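# Checkpoint names are expected to look like <prefix>_acc_<ACC>_ppl_<PPL>_e<EPOCH>.pt,
# so after splitting on '_', $(NF-3) is the accuracy and $(NF-1) the perplexity;
# each checkpoint is scored as accuracy - perplexity and the best score wins.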
model=`ls $OUT/models/*.pt| awk -F '_' 'BEGIN{maxv=-1000000} {score=$(NF-3)-$(NF-1); if (score > maxv) {maxv=score; max=$0}} END{ print max}'`
echo "Chosen Model = $model"
if [[ -z "$model" ]]; then
echo "Model not found. Looked in $OUT/models/"
exit 1
fi
GPU_OPTS=""
if [[ ! -z $GPUARG ]]; then
    GPU_OPTS="-gpu $GPUARG"
fi
echo "Step 3a: Translate Test"
python $ONMT/translate.py -model $model \
    -src $OUT/data/test.src \
    -output $OUT/test/test.out \
    -replace_unk -verbose $GPU_OPTS > $OUT/test/test.log
echo "Step 3b: Translate Dev"
python $ONMT/translate.py -model $model \
    -src $OUT/data/valid.src \
    -output $OUT/test/valid.out \
    -replace_unk -verbose $GPU_OPTS > $OUT/test/valid.log
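# apply_bpe.py marks subword joins with "@@ "; the sed below strips those markers
# so the hypotheses are whole words again and can be scored against the untouched references.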
if [[ "$BPE" == *"tgt"* ]]; then
echo "BPE decoding/detokenising target to match with references"
mv $OUT/test/test.out{,.bpe}
mv $OUT/test/valid.out{,.bpe}
cat $OUT/test/valid.out.bpe | sed -E 's/(@@ )|(@@ ?$)//g' > $OUT/test/valid.out
cat $OUT/test/test.out.bpe | sed -E 's/(@@ )|(@@ ?$)//g' > $OUT/test/test.out
fi
echo "Step 4a: Evaluate Test"
$ONMT/tools/multi-bleu-detok.perl $OUT/data/test.tgt < $OUT/test/test.out > $OUT/test/test.tc.bleu
$ONMT/tools/multi-bleu-detok.perl -lc $OUT/data/test.tgt < $OUT/test/test.out > $OUT/test/test.lc.bleu
echo "Step 4b: Evaluate Dev"
$ONMT/tools/multi-bleu-detok.perl $OUT/data/valid.tgt < $OUT/test/valid.out > $OUT/test/valid.tc.bleu
$ONMT/tools/multi-bleu-detok.perl -lc $OUT/data/valid.tgt < $OUT/test/valid.out > $OUT/test/valid.lc.bleu
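# *.tc.bleu holds case-sensitive BLEU; *.lc.bleu holds lowercased BLEU (the -lc flag).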
#===== EXPERIMENT END ======