[egs] Changes to IAM handwriting-recognition recipe, including BPE encoding (#2658)
kaldi-asr · Sep 12, 2018 · 17b8f6d · 17b8f6d
1 parent c99a860
commit 17b8f6d
Show file tree

Hide file tree

Showing 24 changed files with 1,412 additions and 566 deletions.
diff --git a/egs/iam/v1/local/unk_arc_post_to_transcription.py b/egs/iam/v1/local/unk_arc_post_to_transcription.py
@@ -1,88 +1,107 @@
 #!/usr/bin/env python3
 
-# Copyright     2017  Ashish Arora
+#Copyright      2017  Ashish Arora
 
+""" This module will be used by scripts for open vocabulary setup.
+ If the hypothesis transcription contains <unk>, then it will replace the 
+ <unk> with the word predicted by <unk> model by concatenating phones decoded 
+ from the unk-model. It is currently supported only for triphone setup.
+ Args:
+  phones: File name of a file that contains the phones.txt, (symbol-table for phones).
+          phone and phoneID, Eg. a 217, phoneID of 'a' is 217. 
+  words: File name of a file that contains the words.txt, (symbol-table for words). 
+         word and wordID. Eg. ACCOUNTANCY 234, wordID of 'ACCOUNTANCY' is 234.
+  unk: File name of a file that contains the ID of <unk>. Eg. 231.
+  one-best-arc-post: A file in arc-post format, which is a list of timing info and posterior
+               of arcs along the one-best path from the lattice.
+               E.g. 506_m01-049-00 8 12  1 7722  282 272 288 231
+                    <utterance-id> <start-frame> <num-frames> <posterior> <word> [<ali>] 
+                    [<phone1> <phone2>...]
+  output-text: File containing hypothesis transcription with <unk> recognized by the
+               unk-model.
+               E.g. A move to stop mr. gaitskell.
+  
+  Eg. local/unk_arc_post_to_transcription.py lang/phones.txt lang/words.txt 
+      data/lang/oov.int
+"""
 import argparse
+import os
 import sys
-
 parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""")
-parser.add_argument('phones', type=str, help='phones and phonesID')
-parser.add_argument('words', type=str, help='word and wordID')
-parser.add_argument('unk', type=str, default='-', help='location of unk file')
-parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data')
-parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data')
+parser.add_argument('phones', type=str, help='File name of a file that contains the'
+                    'symbol-table for phones. Each line must be: <phone> <phoneID>')
+parser.add_argument('words', type=str, help='File name of a file that contains the'
+                    'symbol-table for words. Each line must be: <word> <word-id>')
+parser.add_argument('unk', type=str, default='-', help='File name of a file that'
+                    'contains the ID of <unk>. The content must be: <oov-id>, e.g. 231')
+parser.add_argument('--one-best-arc-post', type=str, default='-', help='A file in arc-post'
+                    'format, which is a list of timing info and posterior of arcs'
+                    'along the one-best path from the lattice')
+parser.add_argument('--output-text', type=str, default='-', help='File containing'
+                    'hypothesis transcription with <unk> recognized by the unk-model')
 args = parser.parse_args()
 
-
 ### main ###
-phone_fh = open(args.phones, 'r', encoding='latin-1')
-word_fh = open(args.words, 'r', encoding='latin-1')
-unk_fh = open(args.unk, 'r', encoding='latin-1')
-if args.input_ark == '-':
-    input_fh = sys.stdin
+phone_handle = open(args.phones, 'r', encoding='latin-1') # Create file handles 
+word_handle = open(args.words, 'r', encoding='latin-1')
+unk_handle = open(args.unk,'r', encoding='latin-1')
+if args.one_best_arc_post == '-':
+    arc_post_handle = sys.stdin
 else:
-    input_fh = open(args.input_ark, 'r', encoding='latin-1')
-if args.out_ark == '-':
-    out_fh = sys.stdout
+    arc_post_handle = open(args.one_best_arc_post, 'r', encoding='latin-1')
+if args.output_text == '-':
+    output_text_handle = sys.stdout
 else:
-    out_fh = open(args.out_ark, 'w', encoding='latin-1')
+    output_text_handle = open(args.output_text, 'w', encoding='latin-1')
 
-phone_dict = dict()  # Stores phoneID and phone mapping
-phone_data_vect = phone_fh.read().strip().split("\n")
-for key_val in phone_data_vect:
+id2phone = dict() # Stores the mapping from phone_id (str, as read from phones.txt) to phone (str)
+phones_data = phone_handle.read().strip().split("\n")
+
+for key_val in phones_data:
   key_val = key_val.split(" ")
-  phone_dict[key_val[1]] = key_val[0]
+  id2phone[key_val[1]] = key_val[0]
+
 word_dict = dict()
-word_data_vect = word_fh.read().strip().split("\n")
+word_data_vect = word_handle.read().strip().split("\n")
+
 for key_val in word_data_vect:
   key_val = key_val.split(" ")
   word_dict[key_val[1]] = key_val[0]
-unk_val = unk_fh.read().strip().split(" ")[0]
+unk_val = unk_handle.read().strip().split(" ")[0]
 
-utt_word_dict = dict()
-utt_phone_dict = dict()  # Stores utteranceID and phoneID
-unk_word_dict = dict()
-count=0
-for line in input_fh:
+utt_word_dict = dict() # Dict of lists: maps utterance-id (str) to the list of its words (str)
+for line in arc_post_handle:
   line_vect = line.strip().split("\t")
-  if len(line_vect) < 6:
-    print("Bad line: '{}'   Expecting 6 fields. Skipping...".format(line),
+  if len(line_vect) < 6: # Check for 1best-arc-post output
+    print("Error: Bad line: '{}'   Expecting 6 fields. Skipping...".format(line),
           file=sys.stderr)
     continue
-  uttID = line_vect[0]
+  utt_id = line_vect[0]
   word = line_vect[4]
   phones = line_vect[5]
-  if uttID in utt_word_dict.keys():
-    utt_word_dict[uttID][count] = word
-    utt_phone_dict[uttID][count] = phones
-  else:
-    count = 0
-    utt_word_dict[uttID] = dict()
-    utt_phone_dict[uttID] = dict()
-    utt_word_dict[uttID][count] = word
-    utt_phone_dict[uttID][count] = phones
-  if word == unk_val:   # Get character sequence for unk
-    phone_key_vect = phones.split(" ")
-    phone_val_vect = list()
-    for pkey in phone_key_vect:
-      phone_val_vect.append(phone_dict[pkey])
+  if utt_id not in list(utt_word_dict.keys()):
+    utt_word_dict[utt_id] = list()
+
+  if word == unk_val: # Get the 1best phone sequence given by the unk-model
+    phone_id_seq = phones.split(" ")
+    phone_seq = list()
+    for pkey in phone_id_seq:
+      phone_seq.append(id2phone[pkey]) # Convert the phone-id sequence to a phone sequence.
     phone_2_word = list()
-    for phone_val in phone_val_vect:
-      phone_2_word.append(phone_val.split('_')[0])
-    phone_2_word = ''.join(phone_2_word)
-    utt_word_dict[uttID][count] = phone_2_word
+    for phone_val in phone_seq:
+      phone_2_word.append(phone_val.split('_')[0]) # Removing the word-position markers (e.g. _B)
+    phone_2_word = ''.join(phone_2_word) # Concatenate phone sequence
+    utt_word_dict[utt_id].append(phone_2_word) # Store word from unk-model
   else:
-    if word == '0':
+    if word == '0': # Store space/silence
       word_val = ' '
     else:
       word_val = word_dict[word]
-    utt_word_dict[uttID][count] = word_val
-  count += 1
+    utt_word_dict[utt_id].append(word_val) # Store word from 1best-arc-post
 
-transcription = ""
-for key in sorted(utt_word_dict.keys()):
-  transcription = key
-  for index in sorted(utt_word_dict[key].keys()):
-    value = utt_word_dict[key][index]
-    transcription = transcription + " " + value
-  out_fh.write(transcription + '\n')
+transcription = "" # Output transcription
+for utt_key in sorted(utt_word_dict.keys()):
+  transcription = utt_key
+  for word in utt_word_dict[utt_key]:
+    transcription = transcription + " " + word
+  output_text_handle.write(transcription + '\n')
diff --git a/egs/iam/v2/cmd.sh b/egs/iam/v2/cmd.sh
diff --git a/egs/iam/v2/local/augment_data.sh b/egs/iam/v2/local/augment_data.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Copyright   2018 Hossein Hadian
+#             2018 Ashish Arora
+
+# Apache 2.0
+# This script performs data augmentation.
+
+nj=4
+cmd=run.pl
+feat_dim=40
+echo "$0 $@"
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh || exit 1;
+
+srcdir=$1
+outdir=$2
+datadir=$3
+aug_set=aug1
+mkdir -p $datadir/augmentations
+echo "copying $srcdir to $datadir/augmentations/$aug_set, allowed length, creating feats.scp"
+
+for set in $aug_set; do
+  image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \
+    $srcdir $datadir/augmentations/$set
+  cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt
+  local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \
+    --fliplr false --augment true $datadir/augmentations/$set
+done
+
+echo " combine original data and data from different augmentations"
+utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/$aug_set
+cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt
diff --git a/egs/iam/v2/local/chain/compare_wer.sh b/egs/iam/v2/local/chain/compare_wer.sh
@@ -50,6 +50,36 @@ for x in $*; do
 done
 echo
 
+echo -n "# WER val                    "
+for x in $*; do
+  wer=$(cat $x/decode_val/scoring_kaldi/best_wer | awk '{print $2}')
+  printf "% 10s" $wer
+done
+echo
+
+echo -n "# WER (rescored) val         "
+for x in $*; do
+  wer="--"
+  [ -d $x/decode_val_rescored ] && wer=$(cat $x/decode_val_rescored/scoring_kaldi/best_wer | awk '{print $2}')
+  printf "% 10s" $wer
+done
+echo
+
+echo -n "# CER val                    "
+for x in $*; do
+  cer=$(cat $x/decode_val/scoring_kaldi/best_cer | awk '{print $2}')
+  printf "% 10s" $cer
+done
+echo
+
+echo -n "# CER (rescored) val         "
+for x in $*; do
+  cer="--"
+  [ -d $x/decode_val_rescored ] && cer=$(cat $x/decode_val_rescored/scoring_kaldi/best_cer | awk '{print $2}')
+  printf "% 10s" $cer
+done
+echo
+
 if $used_epochs; then
   exit 0;  # the diagnostics aren't comparable between regular and discriminatively trained systems.
 fi

diff --git a/egs/iam/v2/local/chain/run_cnn_e2eali.sh b/egs/iam/v2/local/chain/run_cnn_e2eali.sh
@@ -1 +1 @@
-tuning/run_cnn_e2eali_1c.sh
+tuning/run_cnn_e2eali_1d.sh