Skip to content

Commit 1bf7a16

Browse files
authored
[egs] multi_cn: fix false removal of 1st field of clean text (kaldi-asr#4020)
1 parent e5a5a28 commit 1bf7a16

File tree

1 file changed

+2
-2
lines changed

1 file changed

+2
-2
lines changed

egs/multi_cn/s5/local/train_lms.sh

+2 -2
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@ fi
2929

3030
cleantext=$dir/text.no_oov
3131

32+
# note: ignore 1st field of text, it's the utterance-id.
3233
cat $text | awk '{$1=""; print substr($0, 2)}' | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
3334
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<UNK> ");} } printf("\n");}' \
3435
> $cleantext || exit 1;
@@ -47,9 +48,8 @@ cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
4748
cat $dir/unigram.counts | awk '{print $2}' | get_word_map.pl "<s>" "</s>" "<UNK>" > $dir/word_map \
4849
|| exit 1;
4950

50-
# note: ignore 1st field of train.txt, it's the utterance-id.
5151
cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline<wmap)>0)map[$1]=$2;}
52-
{ for(n=2;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz \
52+
{ for(n=1;n<=NF;n++) { printf map[$n]; if(n<NF){ printf " "; } else { print ""; }}}' | gzip -c >$dir/train.gz \
5353
|| exit 1;
5454

5555
train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1;

0 commit comments

Comments
 (0)