[egs] Changes to IAM handwriting-recognition recipe, including BPE encoding (#2658)
aarora8 authored and danpovey committed Sep 12, 2018
1 parent c99a860 commit 17b8f6d
Showing 24 changed files with 1,412 additions and 566 deletions.
137 changes: 78 additions & 59 deletions egs/iam/v1/local/unk_arc_post_to_transcription.py
@@ -1,88 +1,107 @@
 #!/usr/bin/env python3
 
-# Copyright 2017 Ashish Arora
+#Copyright 2017 Ashish Arora
+
+""" This module will be used by scripts for open vocabulary setup.
+ If the hypothesis transcription contains <unk>, then it will replace the
+ <unk> with the word predicted by the <unk> model by concatenating the phones
+ decoded from the unk-model. It is currently supported only for the triphone setup.
+ Args:
+  phones: File name of a file that contains the phones.txt (symbol-table for phones);
+          phone and phoneID, e.g. a 217, the phoneID of 'a' is 217.
+  words: File name of a file that contains the words.txt (symbol-table for words);
+         word and wordID, e.g. ACCOUNTANCY 234, the wordID of 'ACCOUNTANCY' is 234.
+  unk: ID of <unk>, e.g. 231.
+  one-best-arc-post: A file in arc-post format, which is a list of timing info and posterior
+                     of arcs along the one-best path from the lattice.
+                     E.g. 506_m01-049-00 8 12 1 7722 282 272 288 231
+                     <utterance-id> <start-frame> <num-frames> <posterior> <word> [<ali>]
+                     [<phone1> <phone2>...]
+  output-text: File containing the hypothesis transcription with <unk> recognized by the
+               unk-model.
+               E.g. A move to stop mr. gaitskell.
+ E.g. local/unk_arc_post_to_transcription.py lang/phones.txt lang/words.txt
+      data/lang/oov.int
+"""
 import argparse
 import os
 import sys
 
 parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""")
-parser.add_argument('phones', type=str, help='phones and phonesID')
-parser.add_argument('words', type=str, help='word and wordID')
-parser.add_argument('unk', type=str, default='-', help='location of unk file')
-parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data')
-parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data')
+parser.add_argument('phones', type=str, help='File name of a file that contains the '
+                    'symbol-table for phones. Each line must be: <phone> <phoneID>')
+parser.add_argument('words', type=str, help='File name of a file that contains the '
+                    'symbol-table for words. Each line must be: <word> <word-id>')
+parser.add_argument('unk', type=str, default='-', help='File name of a file that '
+                    'contains the ID of <unk>. The content must be: <oov-id>, e.g. 231')
+parser.add_argument('--one-best-arc-post', type=str, default='-', help='A file in arc-post '
+                    'format, which is a list of timing info and posterior of arcs '
+                    'along the one-best path from the lattice')
+parser.add_argument('--output-text', type=str, default='-', help='File containing the '
+                    'hypothesis transcription with <unk> recognized by the unk-model')
 args = parser.parse_args()
 
 
 ### main ###
-phone_fh = open(args.phones, 'r', encoding='latin-1')
-word_fh = open(args.words, 'r', encoding='latin-1')
-unk_fh = open(args.unk, 'r', encoding='latin-1')
-if args.input_ark == '-':
-    input_fh = sys.stdin
+phone_handle = open(args.phones, 'r', encoding='latin-1')  # Create file handles
+word_handle = open(args.words, 'r', encoding='latin-1')
+unk_handle = open(args.unk, 'r', encoding='latin-1')
+if args.one_best_arc_post == '-':
+    arc_post_handle = sys.stdin
 else:
-    input_fh = open(args.input_ark, 'r', encoding='latin-1')
-if args.out_ark == '-':
-    out_fh = sys.stdout
+    arc_post_handle = open(args.one_best_arc_post, 'r', encoding='latin-1')
+if args.output_text == '-':
+    output_text_handle = sys.stdout
 else:
-    out_fh = open(args.out_ark, 'w', encoding='latin-1')
+    output_text_handle = open(args.output_text, 'w', encoding='latin-1')
 
-phone_dict = dict() # Stores phoneID and phone mapping
-phone_data_vect = phone_fh.read().strip().split("\n")
-for key_val in phone_data_vect:
+id2phone = dict()  # Stores the mapping from phone_id (int) to phone (char)
+phones_data = phone_handle.read().strip().split("\n")
+
+for key_val in phones_data:
     key_val = key_val.split(" ")
-    phone_dict[key_val[1]] = key_val[0]
+    id2phone[key_val[1]] = key_val[0]
 
 word_dict = dict()
-word_data_vect = word_fh.read().strip().split("\n")
+word_data_vect = word_handle.read().strip().split("\n")
+
 for key_val in word_data_vect:
     key_val = key_val.split(" ")
     word_dict[key_val[1]] = key_val[0]
-unk_val = unk_fh.read().strip().split(" ")[0]
+
+unk_val = unk_handle.read().strip().split(" ")[0]
 
-utt_word_dict = dict()
-utt_phone_dict = dict() # Stores utteranceID and phoneID
-unk_word_dict = dict()
-count=0
-for line in input_fh:
+utt_word_dict = dict()  # Dict of lists; stores the mapping from utterance_id (str) to words (str)
+for line in arc_post_handle:
     line_vect = line.strip().split("\t")
-    if len(line_vect) < 6:
-        print("Bad line: '{}' Expecting 6 fields. Skipping...".format(line),
+    if len(line_vect) < 6:  # Check for valid 1best-arc-post output
+        print("Error: Bad line: '{}' Expecting 6 fields. Skipping...".format(line),
              file=sys.stderr)
         continue
-    uttID = line_vect[0]
+    utt_id = line_vect[0]
     word = line_vect[4]
     phones = line_vect[5]
-    if uttID in utt_word_dict.keys():
-        utt_word_dict[uttID][count] = word
-        utt_phone_dict[uttID][count] = phones
-    else:
-        count = 0
-        utt_word_dict[uttID] = dict()
-        utt_phone_dict[uttID] = dict()
-        utt_word_dict[uttID][count] = word
-        utt_phone_dict[uttID][count] = phones
-    if word == unk_val: # Get character sequence for unk
-        phone_key_vect = phones.split(" ")
-        phone_val_vect = list()
-        for pkey in phone_key_vect:
-            phone_val_vect.append(phone_dict[pkey])
+    if utt_id not in list(utt_word_dict.keys()):
+        utt_word_dict[utt_id] = list()
+
+    if word == unk_val:  # Get the 1best phone sequence given by the unk-model
+        phone_id_seq = phones.split(" ")
+        phone_seq = list()
+        for pkey in phone_id_seq:
+            phone_seq.append(id2phone[pkey])  # Convert the phone-id sequence to a phone sequence.
         phone_2_word = list()
-        for phone_val in phone_val_vect:
-            phone_2_word.append(phone_val.split('_')[0])
-        phone_2_word = ''.join(phone_2_word)
-        utt_word_dict[uttID][count] = phone_2_word
+        for phone_val in phone_seq:
+            phone_2_word.append(phone_val.split('_')[0])  # Remove the word-position markers (e.g. _B)
+        phone_2_word = ''.join(phone_2_word)  # Concatenate the phone sequence into a word
+        utt_word_dict[utt_id].append(phone_2_word)  # Store the word given by the unk-model
     else:
-        if word == '0':
+        if word == '0':  # Store space/silence
             word_val = ' '
         else:
             word_val = word_dict[word]
-        utt_word_dict[uttID][count] = word_val
-    count += 1
+        utt_word_dict[utt_id].append(word_val)  # Store the word from the 1best-arc-post
 
-transcription = ""
-for key in sorted(utt_word_dict.keys()):
-    transcription = key
-    for index in sorted(utt_word_dict[key].keys()):
-        value = utt_word_dict[key][index]
-        transcription = transcription + " " + value
-    out_fh.write(transcription + '\n')
+transcription = ""  # Output transcription
+for utt_key in sorted(utt_word_dict.keys()):
+    transcription = utt_key
+    for word in utt_word_dict[utt_key]:
+        transcription = transcription + " " + word
+    output_text_handle.write(transcription + '\n')
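
For context, a hypothetical invocation of the updated script might look as follows. The positional arguments and option names come from the new argparse interface above; the exp/... paths are placeholders, not part of this commit:

local/unk_arc_post_to_transcription.py \
  lang/phones.txt lang/words.txt data/lang/oov.int \
  --one-best-arc-post exp/chain/decode_test/one_best_arc_post.txt \
  --output-text exp/chain/decode_test/text

Since both options default to '-', the script can also read the arc-post list from stdin and write transcriptions to stdout as part of a pipeline.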
Empty file modified egs/iam/v2/cmd.sh (mode 100644 → 100755)
34 changes: 34 additions & 0 deletions egs/iam/v2/local/augment_data.sh
@@ -0,0 +1,34 @@
#!/bin/bash
# Copyright 2018  Hossein Hadian
#           2018  Ashish Arora

# Apache 2.0
# This script performs data augmentation.

nj=4
cmd=run.pl
feat_dim=40
echo "$0 $@"

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh || exit 1;

srcdir=$1
outdir=$2
datadir=$3
aug_set=aug1
mkdir -p $datadir/augmentations
echo "copying $srcdir to $datadir/augmentations/$aug_set, allowed length, creating feats.scp"

for set in $aug_set; do
  image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \
    $srcdir $datadir/augmentations/$set
  cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt
  local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \
    --fliplr false --augment true $datadir/augmentations/$set
done

echo " combine original data and data from different augmentations"
utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/$aug_set
cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt
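
As a usage sketch (the directory names are placeholders): the script takes the source data directory, the combined output directory, and the working directory as positional arguments, and the defaults set above (nj, cmd, feat_dim) can be overridden on the command line via utils/parse_options.sh:

local/augment_data.sh --nj 8 --cmd "$cmd" --feat-dim 40 \
  data/train data/train_aug data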
30 changes: 30 additions & 0 deletions egs/iam/v2/local/chain/compare_wer.sh
@@ -50,6 +50,36 @@ for x in $*; do
done
echo

echo -n "# WER val "
for x in $*; do
wer=$(cat $x/decode_val/scoring_kaldi/best_wer | awk '{print $2}')
printf "% 10s" $wer
done
echo

echo -n "# WER (rescored) val "
for x in $*; do
wer="--"
[ -d $x/decode_val_rescored ] && wer=$(cat $x/decode_val_rescored/scoring_kaldi/best_wer | awk '{print $2}')
printf "% 10s" $wer
done
echo

echo -n "# CER val "
for x in $*; do
cer=$(cat $x/decode_val/scoring_kaldi/best_cer | awk '{print $2}')
printf "% 10s" $cer
done
echo

echo -n "# CER (rescored) val "
for x in $*; do
cer="--"
[ -d $x/decode_val_rescored ] && cer=$(cat $x/decode_val_rescored/scoring_kaldi/best_cer | awk '{print $2}')
printf "% 10s" $cer
done
echo

if $used_epochs; then
exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems.
fi
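
A hypothetical comparison run (the experiment directory names are placeholders); each argument becomes one column of the report, and the rows added here print WER/CER on the val set, with rescored variants whenever a decode_val_rescored directory exists:

local/chain/compare_wer.sh exp/chain/cnn_1a exp/chain/cnn_e2eali_1b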
2 changes: 1 addition & 1 deletion egs/iam/v2/local/chain/run_cnn_e2eali.sh
