File tree 1 file changed +2
-2
lines changed
1 file changed +2
-2
lines changed Original file line number Diff line number Diff line change 29
29
30
30
cleantext=$dir/text.no_oov
31
31
32
+ # note: ignore 1st field of text, it's the utterance-id.
32
33
cat $text | awk ' {$1=""; print substr($0, 2)}' | awk -v lex=$lexicon ' BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
33
34
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<UNK> ");} } printf("\n");}' \
34
35
> $cleantext || exit 1;
@@ -47,9 +48,8 @@ cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
47
48
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map \
48
49
|| exit 1;
49
50
50
- # note: ignore 1st field of train.txt, it's the utterance-id.
51
51
cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
52
- { for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c > $dir/train.gz \
52
+ { for(n=1;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c > $dir/train.gz \
53
53
|| exit 1;
54
54
55
55
train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1;
You can’t perform that action at this time.
0 commit comments