Tamil OCR with training data for YOMDLE and testing data from SLAM #2621

Merged: 75 commits, Sep 6, 2018 (changes shown from 61 commits).

Commits
b47dd7e
adding tamil setup
aarora8 Aug 12, 2018
25dda22
modification for adding corpus text
aarora8 Aug 12, 2018
ebd0b6b
minor fix
aarora8 Aug 12, 2018
6856299
minor fix in make feature augmentation
aarora8 Aug 12, 2018
b9612e9
minor change
aarora8 Aug 12, 2018
4995e14
minor change
aarora8 Aug 12, 2018
85b2a82
moving files to shared folder
aarora8 Aug 12, 2018
b113c80
minor change
aarora8 Aug 12, 2018
618b83e
minor change
aarora8 Aug 12, 2018
363be25
minor change
aarora8 Aug 12, 2018
39e8a2f
minor change
aarora8 Aug 12, 2018
6de72eb
minor change
aarora8 Aug 12, 2018
00fed2c
updating create download dir code
aarora8 Aug 12, 2018
c745435
minor change
aarora8 Aug 12, 2018
d35b044
adding comments
aarora8 Aug 12, 2018
3209a23
adding semi sup changes
aarora8 Aug 12, 2018
ffed6ab
adding normalised scoring
aarora8 Aug 12, 2018
aa47adf
fixing normalised scoring
aarora8 Aug 12, 2018
1799a79
minor change
aarora8 Aug 13, 2018
d2e71a0
adding tuning directory
aarora8 Aug 13, 2018
2771602
adding lattice rescoring
aarora8 Aug 13, 2018
f79e189
adding back prepend words due to error
aarora8 Aug 13, 2018
36fe65a
minor change
aarora8 Aug 13, 2018
ced61a3
removing normalised script
aarora8 Aug 13, 2018
2dc5029
cosmetic change
aarora8 Aug 13, 2018
afad361
trying to fix bug
aarora8 Aug 13, 2018
f6d751c
trying to fix bug plus minor structure changes
aarora8 Aug 13, 2018
f4b54b4
minor change
aarora8 Aug 13, 2018
a7be710
adding semi sup changes
aarora8 Aug 13, 2018
70619e3
minor change changing comments
aarora8 Aug 13, 2018
65d7cce
adding pocolm in ignore
aarora8 Aug 13, 2018
0d35f5d
minor change for train unsup
aarora8 Aug 13, 2018
bffe0aa
minor change
aarora8 Aug 13, 2018
b1d1f16
minor change
aarora8 Aug 13, 2018
0f25fb9
minor change
aarora8 Aug 13, 2018
1f8d5ce
minor change to speedup find error
aarora8 Aug 13, 2018
8449750
minor change to speedup find error
aarora8 Aug 13, 2018
2c6fd68
minor change
aarora8 Aug 13, 2018
77ab4de
modification for removing language option from prepare data
aarora8 Aug 13, 2018
98fc65d
minor change
aarora8 Aug 13, 2018
990d1d9
minor change
aarora8 Aug 13, 2018
2948c69
removing empty text lines from train, hopefully it will solve bug
aarora8 Aug 14, 2018
5b6d07c
adding normalised scoring scripts
aarora8 Aug 14, 2018
e30dc3d
minor changes
aarora8 Aug 14, 2018
a5ff9c5
minor change
aarora8 Aug 14, 2018
6ffdf78
minor change
aarora8 Aug 14, 2018
ef0f095
adding comments
aarora8 Aug 14, 2018
09f2b11
removing .swp file
aarora8 Aug 14, 2018
051972b
converting python2 to 3
aarora8 Aug 14, 2018
b92f9c2
modification1 from review
aarora8 Aug 14, 2018
fca70e5
modifications2 from review
aarora8 Aug 15, 2018
e955659
modifications3 from review
aarora8 Aug 15, 2018
3fc2069
modification from review
aarora8 Aug 15, 2018
bd58b60
modification from review
aarora8 Aug 15, 2018
2656151
modification from review
aarora8 Aug 15, 2018
995ea72
modification from the review
aarora8 Aug 20, 2018
7a80b3f
modification from review
aarora8 Aug 20, 2018
6944901
modification from review
aarora8 Aug 20, 2018
7ea9629
fixing bug in augmentation, modification from review
aarora8 Aug 20, 2018
8ad399d
bug fixes, modification from the review
aarora8 Aug 21, 2018
8cd3a60
adding results
aarora8 Aug 21, 2018
b8a2cef
fixing bugs
aarora8 Aug 28, 2018
9587372
modification from the review
aarora8 Aug 28, 2018
3876435
fixing bug in augmentation
aarora8 Aug 28, 2018
3ba5578
fixing bug
aarora8 Aug 28, 2018
c7c9a8e
adding augmentation in semi supervised training
aarora8 Aug 28, 2018
c7b365a
fixing bugs
aarora8 Aug 30, 2018
6090afa
cosmetic changes
aarora8 Aug 30, 2018
51eaceb
fixing bugs
aarora8 Aug 30, 2018
c5776de
fixing bug
aarora8 Aug 30, 2018
00441a8
cosmetic change
aarora8 Aug 31, 2018
dd0a307
fixing bug
aarora8 Aug 31, 2018
19acf9b
modification from the review, small fix
aarora8 Aug 31, 2018
9f42e44
modification from review
aarora8 Aug 31, 2018
ed3ec4a
updating results after adding augmentation
aarora8 Sep 1, 2018
1 change: 1 addition & 0 deletions .gitignore
@@ -79,6 +79,7 @@ GSYMS
/egs/*/s*/data

# /tools/
/tools/pocolm/
/tools/ATLAS/
/tools/atlas3.8.3.tar.gz
/tools/irstlm/
170 changes: 170 additions & 0 deletions egs/cifar/v1/image/ocr/make_features.py
@@ -0,0 +1,170 @@
#!/usr/bin/env python3

# Copyright 2017 Chun Chieh Chang
# 2017 Ashish Arora
# 2017 Yiwen Shao
# 2018 Hossein Hadian

""" This script converts images to Kaldi-format feature matrices. The input to
this script is the path to a data directory, e.g. "data/train". This script
reads the images listed in images.scp and writes them to standard output
(by default) as Kaldi-formatted matrices (in text form). It also scales the
images so they have the same height (via --feat-dim). It can optionally pad
the images (on left/right sides) with white pixels. It by default performs
augmentation, (directly scaling down and scaling up). It will double the
data but we can turn augmentation off (via --no-augment).
If an 'image2num_frames' file is found in the data dir, it will be used
to enforce the images to have the specified length in that file by padding
white pixels (the --padding option will be ignored in this case). This relates
to end2end chain training.
eg. local/make_features.py data/train --feat-dim 40
"""
import random
import argparse
import os
import sys
import numpy as np
from scipy import misc
import math
from signal import signal, SIGPIPE, SIG_DFL
signal(SIGPIPE, SIG_DFL)

parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and
                                 writes them to standard output in text format.""")
parser.add_argument('images_scp_path', type=str,
                    help='Path of images.scp file')
parser.add_argument('--allowed_len_file_path', type=str, default=None,
                    help='If supplied, each image will be padded to reach the '
                         'target length (this overrides --padding).')
parser.add_argument('--out-ark', type=str, default='-',
                    help='Where to write the output feature file')
parser.add_argument('--feat-dim', type=int, default=40,
                    help='Size to scale the height of all images')
parser.add_argument('--padding', type=int, default=5,
                    help='Number of white pixels to pad on the left '
                         'and right side of the image.')
parser.add_argument("--augment", action="store_true",
                    help="whether or not to do image augmentation")
parser.add_argument("--flip", action="store_true",
                    help="whether or not to flip the image")
args = parser.parse_args()


def write_kaldi_matrix(file_handle, matrix, key):
    file_handle.write(key + " [ ")
    num_rows = len(matrix)
    if num_rows == 0:
        raise Exception("Matrix is empty")
    num_cols = len(matrix[0])

    for row_index in range(len(matrix)):
        if num_cols != len(matrix[row_index]):
            raise Exception("All the rows of a matrix are expected to "
                            "have the same length")
        file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index])))
        if row_index != num_rows - 1:
            file_handle.write("\n")
    file_handle.write(" ]\n")


def horizontal_pad(im, allowed_lengths=None):
    if allowed_lengths is None:
        left_padding = right_padding = args.padding
    else:  # Find an allowed length for the image
        imlen = im.shape[1]  # width
        allowed_len = 0
        for l in allowed_lengths:
            if l > imlen:
                allowed_len = l
                break
        if allowed_len == 0:
            # No allowed length was found for the image (the image is too long)
            return None
        padding = allowed_len - imlen
        left_padding = int(padding // 2)
        right_padding = padding - left_padding
    dim_y = im.shape[0]  # height
    im_pad = np.concatenate((255 * np.ones((dim_y, left_padding),
                                           dtype=int), im), axis=1)
    im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding),
                                                    dtype=int)), axis=1)
    return im_pad1
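
# Example for horizontal_pad: with allowed_lengths=[100, 120] and an image
# 90 pixels wide, the image is padded to width 100 (5 white columns per side).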

def get_scaled_image_aug(im, mode='normal'):
    scale_size = args.feat_dim
    sx = im.shape[1]
    sy = im.shape[0]
    scale = (1.0 * scale_size) / sy
    nx = int(scale_size)
    ny = int(scale * sx)
    # Pick a random small height for the scaled-down augmentation variant.
    scale_size = random.randint(10, 30)
    scale = (1.0 * scale_size) / sy
    down_nx = int(scale_size)
    down_ny = int(scale * sx)
    if mode == 'normal':
        im = misc.imresize(im, (nx, ny))
        return im
    else:
        # Scale down, then back up to the target size (slightly blurs the image).
        im_scaled_down = misc.imresize(im, (down_nx, down_ny))
        im_scaled_up = misc.imresize(im_scaled_down, (nx, ny))
        return im_scaled_up


### main ###
random.seed(1)
data_list_path = args.images_scp_path
if args.out_ark == '-':
    out_fh = sys.stdout
else:
    out_fh = open(args.out_ark, 'w')

allowed_lengths = None
allowed_len_handle = args.allowed_len_file_path
# The allowed-lengths file contains one allowed frame-length (integer) per line.
if allowed_len_handle is not None and os.path.isfile(allowed_len_handle):
    print("Found allowed lengths file: {}".format(allowed_len_handle),
          file=sys.stderr)
    allowed_lengths = []
    with open(allowed_len_handle) as f:
        for line in f:
            allowed_lengths.append(int(line.strip()))
    print("Read {} allowed lengths and will apply them to the "
          "features.".format(len(allowed_lengths)), file=sys.stderr)

num_fail = 0
num_ok = 0
aug_setting = ['normal', 'scaled']
with open(data_list_path) as f:
    for line in f:
        line = line.strip()
        line_vect = line.split(' ')
        image_id = line_vect[0]
        image_path = line_vect[1]
        im = misc.imread(image_path)
        if args.flip:
            im = np.fliplr(im)
        if args.augment:
            # Write two variants per image: the normal one and a scaled one.
            for i in range(2):
                image_aug_id = image_id + '_aug' + str(i + 1)
                im_aug = get_scaled_image_aug(im, aug_setting[i])
                im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths)
                if im_horizontal_padded is None:
                    num_fail += 1
                    continue
                data = np.transpose(im_horizontal_padded, (1, 0))
                data = np.divide(data, 255.0)
                num_ok += 1
                write_kaldi_matrix(out_fh, data, image_aug_id)
        else:
            image_aug_id = image_id + '_aug'
            im_aug = get_scaled_image_aug(im, aug_setting[0])
            im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths)
            if im_horizontal_padded is None:
                num_fail += 1
                continue
            data = np.transpose(im_horizontal_padded, (1, 0))
            data = np.divide(data, 255.0)
            num_ok += 1
            write_kaldi_matrix(out_fh, data, image_aug_id)

print('Generated features for {} images. Failed for {} (image too '
      'long).'.format(num_ok, num_fail), file=sys.stderr)
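
For reference, a minimal sketch of how this script is typically wired into a recipe's feature extraction (the exact wrapper in this PR may differ; copy-feats is the standard Kaldi binary for writing the ark/scp pair):

# Hypothetical invocation; paths and options are illustrative.
local/make_features.py data/train/images.scp --feat-dim 40 --augment | \
  copy-feats --compress=true ark:- \
    ark,scp:data/train/data/images.ark,data/train/feats.scp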
64 changes: 64 additions & 0 deletions egs/cifar/v1/image/ocr/process_augment_data.py
@@ -0,0 +1,64 @@
#!/usr/bin/env python3

# Copyright 2017 Yiwen Shao
# 2018 Ashish Arora

# Apache 2.0
# This script reads the feats.scp file and regenerates images.scp, text,
# utt2spk and spk2utt. Augmentation doubles the data, so feats.scp contains
# new utterance ids beyond those in the original images.scp, text and utt2spk;
# this script maps each newly created feature entry back to its text, image
# path and speaker.

import os
import argparse
parser = argparse.ArgumentParser(description="""Regenerate images.scp, text,
                                 utt2spk and spk2utt from feats.scp for augmented data.""")
parser.add_argument('dir', type=str,
                    help='directory of images.scp')
args = parser.parse_args()

text_file = os.path.join(args.dir, 'backup', 'text')
uttid_to_text = dict()  # stores uttid and text
with open(text_file) as text_fh:
    for uttid_text in text_fh:
        uttid_text_vect = uttid_text.strip().split(" ")
        utt_id = uttid_text_vect[0]
        text_vect = uttid_text_vect[1:]
        text = " ".join(text_vect)
        uttid_to_text[utt_id] = text

utt2spk_file = os.path.join(args.dir, 'backup', 'utt2spk')
uttid_to_spk = dict()  # stores uttid and speaker
with open(utt2spk_file) as utt2spk_fh:
    for uttid_spk in utt2spk_fh:
        uttid_spk_vect = uttid_spk.strip().split(" ")
        utt_id = uttid_spk_vect[0]
        spk = uttid_spk_vect[1]
        uttid_to_spk[utt_id] = spk

image_file = os.path.join(args.dir, 'backup', 'images.scp')

Review comment (Contributor):

Ah! I see now what you are doing. You make a copy of the data dir (as a backup), then call make_features.py to generate feats.scp with 2 times as many utterances as there are. Then you call this script to update text, images.scp and utt2spk to cover all the entries in feats.scp. Is that right?

I think it would be better to take this approach instead: remove anything related to augmentation from extract_features.sh and add a new script local/augment_data.sh <src-dir> <out-dir> which does something like this:

utils/copy_data_dir.sh --spk-prefix aug1- --utt-prefix aug1- $srcdir ${outdir}_aug1
# call extract_features.sh on ${outdir}_aug1 with appropriate augmentation options (which will simply be passed to make_features.py)
# ... do as many augmentations as needed, e.g. aug2, aug3, ... and then combine them all:
utils/combine_data_dir.sh --extra-files images.scp $outdir  $srcdir ${outdir}_aug1 ${outdir}_aug2 ...
# remove the temporary augX dir's:
rm -r ${outdir}_aug1 ${outdir}_aug2 ...

You might need to modify utils/copy_data_dir.sh to be able to handle images.scp.


Reply (Contributor Author):

Thank you. Yeah, the process_augment_data script is doing exactly what you said (updating the text, images.scp and utt2spk to cover all the entries in feats.scp). I will surely make the above-mentioned modifications for augmenting.
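
A minimal sketch of the wrapper described above (the script name, the extract_features.sh options, and the use of utils/combine_data.sh are assumptions based on the review outline, not the final implementation):

#!/bin/bash
# Hypothetical local/augment_data.sh <src-dir> <out-dir>
srcdir=$1
outdir=$2
# Make a copy of the data dir with prefixed utterance/speaker ids.
utils/copy_data_dir.sh --spk-prefix aug1- --utt-prefix aug1- $srcdir ${outdir}_aug1
# Re-extract features on the copy with augmentation enabled (options are
# passed through to make_features.py).
local/extract_features.sh --feat-dim 40 --augment true ${outdir}_aug1
# Combine the original and augmented copies, then remove the temporary dir.
utils/combine_data.sh --extra-files images.scp $outdir $srcdir ${outdir}_aug1
rm -r ${outdir}_aug1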

uttid_to_path = dict()  # stores uttid and image path
with open(image_file) as image_fh:
    for uttid_path in image_fh:
        uttid_path_vect = uttid_path.strip().split(" ")
        utt_id = uttid_path_vect[0]
        path = uttid_path_vect[1]
        uttid_to_path[utt_id] = path

image_file = os.path.join(args.dir, 'images.scp')
image_fh = open(image_file, 'w')
text_file = os.path.join(args.dir, 'text')
text_fh = open(text_file, 'w')
utt2spk_file = os.path.join(args.dir, 'utt2spk')
utt2spk_fh = open(utt2spk_file, 'w')

feats_scp_file = os.path.join(args.dir, 'feats.scp')
with open(feats_scp_file) as feats_scp_fh:
    for uttid_feats in feats_scp_fh:
        uttid_feats_vect = uttid_feats.strip().split(" ")
        utt_id = uttid_feats_vect[0]
        image_id = "_".join(utt_id.split("_")[:-1])
        text_fh.write(utt_id + ' ' + uttid_to_text[image_id] + '\n')
        utt2spk_fh.write(utt_id + ' ' + uttid_to_spk[image_id] + '\n')
        image_fh.write(utt_id + ' ' + uttid_to_path[image_id] + '\n')
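
# Example: a feats.scp utterance id like 'line_001_aug1' maps back to the
# original id 'line_001' ('_aug', '_aug1' or '_aug2' is the suffix that
# make_features.py appends).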

2 changes: 1 addition & 1 deletion egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh
@@ -26,7 +26,7 @@ stage=0
nj=30
train_set=train
nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium.
affix=_1b6 #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration.
affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration.
e2echain_model_dir=exp/chain/e2e_cnn_1a
common_egs_dir=
reporting_email=
15 changes: 15 additions & 0 deletions egs/wsj/s5/utils/lang/bpe/prepend_words.py
@@ -0,0 +1,15 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# This script prepends '|' to every word in the transcript to mark
# the beginning of each word, so that word-initial spaces can be
# recovered after decoding.

import sys, io

infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')

Review comment (Contributor):

For anything that doesn't need to split inside words (like this script) I prefer the latin-1 encoding; that will also work on data like GBK that's not utf-8 but which is compatible with ASCII.

output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
for line in infile:
    output.write(' '.join(["|" + word for word in line.split()]) + '\n')
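
A quick usage check (input and output are illustrative):

$ echo "hello world" | utils/lang/bpe/prepend_words.py
|hello |world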


12 changes: 12 additions & 0 deletions egs/yomdle_tamil/v1/cmd.sh
@@ -0,0 +1,12 @@
# you can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
export cmd="queue.pl"

Review comment (Contributor):

Please only keep cmd.
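
For a machine with no grid software, the advice in the header amounts to something like the following (a sketch; run.pl is Kaldi's standard local-execution wrapper):

export cmd="run.pl"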

1 change: 1 addition & 0 deletions egs/yomdle_tamil/v1/image
66 changes: 66 additions & 0 deletions egs/yomdle_tamil/v1/local/chain/compare_wer.sh
@@ -0,0 +1,66 @@
#!/bin/bash

# this script is used for comparing decoding results between systems.
# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b}

# Copyright 2017 Chun Chieh Chang
# 2017 Ashish Arora

if [ $# == 0 ]; then
  echo "Usage: $0: <dir1> [<dir2> ... ]"
  echo "e.g.: $0 exp/chain/cnn{1a,1b}"
  exit 1
fi

echo "# $0 $*"
used_epochs=false

echo -n "# System               "
for x in $*; do printf "% 10s" " $(basename $x)"; done
echo

echo -n "# WER                  "
for x in $*; do
  wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}')
  printf "% 10s" $wer
done
echo

echo -n "# WER (rescored)       "
for x in $*; do
  wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}')
  printf "% 10s" $wer
done
echo

echo -n "# CER                  "
for x in $*; do
  cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}')
  printf "% 10s" $cer
done
echo

echo -n "# CER (rescored)       "
for x in $*; do
  cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}')
  printf "% 10s" $cer
done
echo

if $used_epochs; then
  exit 0;  # the diagnostics aren't comparable between regular and discriminatively trained systems.
fi

echo -n "# Final train prob     "
for x in $*; do
  prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" $prob
done
echo

echo -n "# Final valid prob     "
for x in $*; do
  prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" $prob
done
echo
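
Illustrative usage (system names are examples and the values are placeholders, not results from this PR):

$ local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a exp/chain/e2e_cnn_1a
# System                cnn_e2eali_1a  e2e_cnn_1a
# WER                   ...            ...
# CER                   ...            ...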
1 change: 1 addition & 0 deletions egs/yomdle_tamil/v1/local/chain/run_cnn_e2eali.sh