Big rewrite
alumae committed Apr 14, 2022
1 parent 949ab59 commit 1983219
Showing 37 changed files with 1,922 additions and 436 deletions.
102 changes: 95 additions & 7 deletions Dockerfile
@@ -1,11 +1,99 @@
FROM alumae/kaldi-offline-transcriber-et
FROM kaldiasr/kaldi:latest
MAINTAINER Tanel Alumae <alumae@gmail.com>

LABEL maintainer="Aivo Olev"
RUN apt-get update && apt-get install -y \
autoconf \
automake \
bzip2 \
g++ \
gfortran \
git \
libatlas3-base \
libtool-bin \
make \
python2.7 \
python-pip \
python-dev \
sox \
ffmpeg \
subversion \
wget \
zlib1g-dev && \
apt-get clean autoclean && \
apt-get autoremove -y

COPY scripts/diarization.sh /opt/kaldi-offline-transcriber/scripts/diarization.sh
COPY .gitignore /opt/kaldi-offline-transcriber/.gitignore
COPY transcribe.nf /opt/kaldi-offline-transcriber/

RUN apt-get update && apt-get install -y procps
ENV PATH="/root/miniconda3/bin:${PATH}"
ARG PATH="/root/miniconda3/bin:${PATH}"

CMD ["/bin/bash"]
RUN wget \
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
&& mkdir /root/.conda \
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
&& rm -f Miniconda3-latest-Linux-x86_64.sh

RUN conda --version

RUN conda install -c conda-forge pynini=2.1.3

RUN conda install pytorch=1.8.1 torchvision torchaudio=0.8.1 cpuonly -c pytorch

RUN conda install ruamel.yaml && \
pip install kaldiio && \
pip install simplejson && \
pip install pytest

RUN pip install speechbrain

WORKDIR /opt

RUN git clone https://github.com/alumae/et-g2p-fst.git

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y locales

RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \
dpkg-reconfigure --frontend=noninteractive locales && \
update-locale LANG=en_US.UTF-8

ENV LANG en_US.UTF-8

RUN apt-get install -y openjdk-8-jre-headless

RUN cd /opt/kaldi/tools && \
extras/install_pocolm.sh

ENV HOME /opt
ENV LD_LIBRARY_PATH /usr/local/lib

RUN ln -s -f /usr/bin/python2 /usr/bin/python && \
apt-get install -y python-numpy python-scipy python3-simplejson python3-pytest && \
pip2 install theano --no-deps

# Set up punctuator
RUN mkdir -p /opt/est-asr-pipeline && \
cd /opt/est-asr-pipeline && \
wget -q -O - http://bark.phon.ioc.ee/tanel/est_punct2.tar.gz | tar xvz

RUN cd /opt/est-asr-pipeline && \
wget -q -O - http://bark.phon.ioc.ee/tanel/kaldi-offline-transcriber-data-2021-06-11.tgz | tar xvz


COPY bin /opt/est-asr-pipeline/bin

ENV KALDI_ROOT /opt/kaldi

RUN cd /opt/est-asr-pipeline && \
touch -m path.sh && \
./bin/compile_models.sh

# This can be removed once the base data pack has been fixed
RUN echo '--sample-frequency=16000' > /opt/est-asr-pipeline/kaldi-data/sid/mfcc_sid.conf && \
echo '--frame-length=25' >> /opt/est-asr-pipeline/kaldi-data/sid/mfcc_sid.conf && \
echo '--low-freq=20' >> /opt/est-asr-pipeline/kaldi-data/sid/mfcc_sid.conf && \
echo '--high-freq=7600' >> /opt/est-asr-pipeline/kaldi-data/sid/mfcc_sid.conf && \
echo '--num-mel-bins=30' >> /opt/est-asr-pipeline/kaldi-data/sid/mfcc_sid.conf && \
echo '--num-ceps=24' >> /opt/est-asr-pipeline/kaldi-data/sid/mfcc_sid.conf && \
echo '--snip-edges=false' >> /opt/est-asr-pipeline/kaldi-data/sid/mfcc_sid.conf


CMD ["/bin/bash"]
59 changes: 41 additions & 18 deletions README.md
@@ -17,52 +17,75 @@ Install Nextflow locally (depends on Java 8, refer to official documentation in

## Usage

First build the container:
### Using a prebuilt Docker image

docker build . -t nextflow
Run:

Start the container (named `nextflow:latest`) and run it in the background (`-d`). Also, mount the local directory `~/tmp/speechfiles` to the following directory in the container: `/opt/speechfiles`.
nextflow run transcribe.nf --in /path/to/some_audiofile.mp3

mkdir -p ~/tmp/speechfiles
docker run --name nextflow -v ~/tmp/speechfiles:/opt/speechfiles --rm -d -t nextflow:latest
The first invocation might take some time, because the Docker image containing all the needed models and libraries is automatically pulled from the remote registry.

To transcribe a speech recording, provide at least the input file, the output file and the file format as parameters. The -with-docker command line option should also refer to the locally built and running Docker container:
A successful invocation will produce output like this:

nextflow run transcribe.nf -with-docker nextflow --in ~/audio.mp3 --out result.json --file_ext mp3
N E X T F L O W ~ version 21.10.6
Launching `transcribe.nf` [backstabbing_baekeland] - revision: 7ee707faa8
executor > local (12)
[ae/4b81cd] process > to_wav [100%] 1 of 1 ✔
[27/370fbd] process > diarization [100%] 1 of 1 ✔
[ec/c58cee] process > prepare_initial_data_dir [100%] 1 of 1 ✔
[c6/299141] process > language_id [100%] 1 of 1 ✔
[d8/9fcbc7] process > mfcc [100%] 1 of 1 ✔
[85/a6589b] process > speaker_id [100%] 1 of 1 ✔
[09/f7d366] process > one_pass_decoding [100%] 1 of 1 ✔
[0e/fd2533] process > rnnlm_rescoring [100%] 1 of 1 ✔
[c0/461429] process > lattice2ctm [100%] 1 of 1 ✔
[b4/04eeee] process > to_json [100%] 1 of 1 ✔
[37/3d7f31] process > punctuation [100%] 1 of 1 ✔
[86/90a123] process > output [100%] 1 of 1 ✔
Completed at: 14-apr-2022 10:55:48
Duration : 3m 31s
CPU hours : 0.1
Succeeded : 12

There is a companion project that lets you set up an API server and a simple user interface for uploading files and retrieving results from this workflow, which can be useful for hosting it: https://github.com/taltechnlp/est-asr-backend

## Configuration
The transcription result in different formats is put into the directory `results/some_audiofile`
(where `some_audiofile` corresponds to the "basename" of your input file):

Nextflow enables various configuration options.
$ ls results/some_audiofile/
result.ctm result.json result.srt result.trs result.with-compounds.ctm

### Running on cluster

TODO

### Running without Docker

TODO

### Command line parameters

Firstly, the main script (transcribe.nf) already has default values for all input parameters. Each of them can be overridden via the command line when executing the script with the nextflow executable: drop the 'params.' prefix and prepend two hyphens '--', so 'params.in' becomes '--in'. The following parameters should be provided unless the default values are satisfactory (see the example invocation after this list):

- --in - The name and location of the audio recording in your local system that needs to be transcribed.
- --file_ext - The file extension of the input file. This does not have to exactly match the actual file name but is important for determining how to convert the file into `wav` format. Supported options: `wav`, `mp3`, `mpga`, `m4a`, `mp4`, `oga`, `ogg`, `mp2`, `flac`.
- --out - The output file name. Cannot be a location in the local system and should be a unique file name. By default it will be saved into the `results/` folder of this project.
- --out_format - Output format of the transcription. By default `json`. Supported options: `json`, `trs`, `with-compounds`, `txt`, `srt`.
- --do_speaker_id - By default 'yes'. Include speaker diarization and identification. The result will include speaker names. Some Estonian celebrities and radio personalities will be identified by name, others will be given IDs.
- --do_punctuation - Whether to attempt punctuation recovery and add punctuation to the transcribed text.
- --do_language_id - Whether to apply a language ID model to discard speech segments that are not in Estonian.
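
For illustration, here is a hypothetical invocation that overrides several of these parameters (the file names and values below are made up):

    nextflow run transcribe.nf \
        --in ~/recordings/interview.mp3 \
        --file_ext mp3 \
        --out interview.json \
        --out_format json \
        --do_speaker_id yes \
        --do_punctuation yes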

### Configuration file

Additional configuration is currently provided via the nextflow.config file. The following parameters should be changed if you need advanced execution optimizations:

- nthreads - By default the script will use 2 system threads for the more CPU-intensive parts of the transcription pipeline. Change this if you are executing the script in parallel multiple times or want to use a different number of threads per execution (see the example below).
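
Since Nextflow lets any `params.*` value be overridden on the command line, `nthreads` can also be changed per run without editing nextflow.config, assuming it is defined under `params` (a sketch, not a verified parameter name):

    nextflow run transcribe.nf --in /path/to/some_audiofile.mp3 --nthreads 4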

The rest of the parameters are there because transcribe.nf needs them. They should not be changed unless you are deliberately modifying the script (for example, replacing the acoustic model or tuning other parameters).

Nextflow offers great support for cluster and cloud environments and Kubernetes. Please consult the Nextflow documentation in order to configure these.

### Command line options

Nextflow allows additional command line options (a combined example follows this list):

- -with-docker - Allows the script to use Docker. This should always be used with this script and must refer to the locally built and running Docker container.
- -with-report - Generates a human-readable HTML execution report, by default into the current folder. Optional; useful for digging into resource consumption details.
- -with-trace - Generates a machine-readable CSV execution report of all the steps in the script, placed into the current folder by default. Useful for gathering general execution statistics.
- -with-dag "filename.png" - Generates a visual directed execution graph showing dependencies between script processes. Not very useful.
- -with-weblog "your-api-endpoint" - Sends real-time statistics to the provided API endpoint. This is used by our backend server (https://github.com/taltechnlp/est-asr-backend) to gather real-time progress information and estimate queue length.
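
For example, a run that also collects an HTML report and a CSV trace (file names are arbitrary):

    nextflow run transcribe.nf --in /path/to/some_audiofile.mp3 \
        -with-report report.html -with-trace trace.txt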
35 changes: 35 additions & 0 deletions bin/align2ctm.py
@@ -0,0 +1,35 @@
#! /usr/bin/env python

import sys
import argparse
from subprocess import Popen, PIPE

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Convert aligned output to CTM')
    parser.add_argument('--frame-shift', default=0.01, type=float)
    parser.add_argument('--unk-word', default="<unk>")
    parser.add_argument('--unk-p2g-cmd', default="")
    args = parser.parse_args()

    unk_p2g_proc = None
    if args.unk_p2g_cmd != "":
        unk_p2g_proc = Popen(args.unk_p2g_cmd, shell=True, stdin=PIPE, stdout=PIPE)

    for l in sys.stdin:
        ss = l.split()
        start_frame = int(ss[1])
        num_frames = int(ss[2])
        word = ss[4]
        phones_str = " ".join(ss[5:])

        if word == args.unk_word and unk_p2g_proc:
            unk_p2g_proc.stdin.write((phones_str + "\n").encode('utf-8'))
            unk_p2g_proc.stdin.flush()
            word = unk_p2g_proc.stdout.readline().strip().decode('utf-8')
            #word = "[%s]" % word

        if word != "<eps>":
            print("%s 1 %0.2f %0.2f %s" % (ss[0], start_frame * args.frame_shift, num_frames * args.frame_shift, word))
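
A hypothetical usage sketch (the input file name is made up): the script reads per-word alignment lines from stdin and writes CTM lines to stdout.

    # 0.03 assumes a chain model with frame subsampling factor 3; the default is 0.01
    ./bin/align2ctm.py --frame-shift 0.03 --unk-word "<unk>" < word_alignments.txt > result.ctm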


36 changes: 36 additions & 0 deletions bin/classify-lang.py
@@ -0,0 +1,36 @@
#! /usr/bin/env python
import logging
import sys
import argparse
import kaldiio
import torch
import pickle
from speechbrain.pretrained import EncoderClassifier

if __name__ == '__main__':
    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
    parser = argparse.ArgumentParser(description="Apply LID to utterances")
    parser.add_argument("dir")

    args = parser.parse_args()

    lid_svc = pickle.load(open("local/lid_clf.pkl", "rb"))

    language_id = EncoderClassifier.from_hparams(source="TalTechNLP/voxlingua107-epaca-tdnn")

    with kaldiio.ReadHelper(f'scp:{args.dir}/wav.scp', segments=f'{args.dir}/segments') as reader:
        for key, (rate, numpy_array) in reader:
            torch_array = torch.from_numpy(numpy_array)

            prediction = language_id.classify_batch(torch_array)

            emb = language_id.encode_batch(torch_array)

            svc_result = lid_svc.predict([emb.squeeze().numpy()])
            print(key, svc_result, prediction[3])

            #breakpoint()
            #top5 = prediction[0][0].argsort(descending=True)[0:5]
            #lang_codes = [language_id.hparams.label_encoder.ind2lab[i.item()] for i in top5]
            #scores = prediction[0][0][top5]
            #print(key, " ".join([f"{l}:{s:0.2f}" for l, s in zip(lang_codes, scores)]))
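
A hypothetical invocation, assuming the working directory contains `local/lid_clf.pkl` and that the single argument is a Kaldi-style data directory with `wav.scp` and `segments` files (the directory name below is made up):

    python bin/classify-lang.py build/audio/datadir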
118 changes: 118 additions & 0 deletions bin/compile_models.sh
@@ -0,0 +1,118 @@
#!/bin/bash

# Begin configuration section.
stage=1

echo "$0 $@" # Print the command line for logging

if [ -z "$KALDI_ROOT" ]; then
echo "Please set KALDI_ROOT"
exit 1;
fi

rm -f steps utils sid rnnlm
ln -s $KALDI_ROOT/egs/wsj/s5/steps
ln -s $KALDI_ROOT/egs/wsj/s5/utils
ln -s $KALDI_ROOT/egs/sre08/v1/sid
ln -s $KALDI_ROOT/scripts/rnnlm


am=cnn_tdnn_1d_online
lexicon=language_model/lexicon.txt
pruned_lm=language_model/interpolated.pruned9.4g.arpa.gz
rnnlm_dir=language_model/rnnlm
compounder_lm=language_model/compounder.pruned9.4g.arpa.gz

. ./utils/parse_options.sh

export PATH=${KALDI_ROOT}/tools/openfst/bin:utils:${KALDI_ROOT}/src/bin:${KALDI_ROOT}/tools/openfst/bin:${KALDI_ROOT}/src/fstbin/:${KALDI_ROOT}/src/lmbin/:${KALDI_ROOT}/src/rnnlmbin/:$PATH

set -e
set -u
set -o pipefail

if [ $stage -le 1 ]; then
rm -rf build/fst/${am}
mkdir -p build/fst/${am}
cp -r kaldi-data/${am}/* build/fst/${am}
# double quotes so the shell expands ${am} before perl sees the substitution
perl -i -npe "s#=.*online/#=build/fst/${am}/#" build/fst/${am}/conf/*.conf
if [ ! -e build/fst/${am}/cmvn_opts ]; then \
echo "--norm-means=false --norm-vars=false" > build/fst/${am}/cmvn_opts; \
fi
fi


if [ $stage -le 2 ]; then
# Make a lexicon from the user-provided lexicon
rm -rf build/fst/data/dict
mkdir -p build/fst/data/dict
cp -r kaldi-data/dict/* build/fst/data/dict
rm -f build/fst/data/dict/lexicon.txt build/fst/data/dict/lexiconp.txt
cat kaldi-data/dict/lexicon.txt | egrep "^<" > build/fst/data/dict/lexicon.txt
cat $lexicon | perl -npe 's/\(\d\)(\s)/\1/' >> build/fst/data/dict/lexicon.txt
fi


if [ $stage -le 3 ]; then
echo "Constructing UNK word LM"
rm -rf build/fst/data/unk_lang_model
utils/lang/make_unk_lm.sh --cmd utils/run.pl build/fst/data/dict build/fst/data/unk_lang_model
fi

if [ $stage -le 4 ]; then
echo "Making build/fst/data/prunedlm/G.fst from the provided ARPA LM"
rm -rf build/fst/data/prunedlm
mkdir -p build/fst/data/prunedlm
utils/prepare_lang.sh --phone-symbol-table build/fst/$am/phones.txt build/fst/data/dict '<unk>' build/fst/data/dict/tmp build/fst/data/prunedlm
gunzip -c $pruned_lm | arpa2fst --disambig-symbol=#0 \
--read-symbol-table=build/fst/data/prunedlm/words.txt - build/fst/data/prunedlm/G.fst
echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic build/fst/data/prunedlm/G.fst || echo "not stochastic (probably OK)"
utils/validate_lang.pl build/fst/data/prunedlm || exit 1
fi


if [ $stage -le 5 ]; then
echo "Making build/fst/data/prunedlm_unk from build/fst/data/prunedlm"
rm -rf build/fst/data/prunedlm_unk
utils/prepare_lang.sh --unk-fst build/fst/data/unk_lang_model/unk_fst.txt build/fst/data/dict "<unk>" build/fst/data/prunedlm build/fst/data/prunedlm_unk
cp build/fst/data/prunedlm/G.fst build/fst/data/prunedlm_unk
fi


if [ $stage -le 6 ]; then
echo "Compiling decoding graph"
rm -rf build/fst/${am}/graph_prunedlm_unk
self_loop_scale_arg=""
if [ -f build/fst/${am}/frame_subsampling_factor ]; then
factor=`cat build/fst/${am}/frame_subsampling_factor`
if [ $factor -eq "3" ]; then
self_loop_scale_arg="--self-loop-scale 1.0 "
fi
fi
utils/mkgraph.sh $self_loop_scale_arg build/fst/data/prunedlm_unk build/fst/${am} build/fst/${am}/graph_prunedlm_unk
rm -rf build/fst/data/prunedlm_unk/tmp
fi


if [ $stage -le 7 ]; then
echo "Preparing RNNLM"
rm -rf build/fst/data/rnnlm_unk
cp $rnnlm_dir/config/unigram_probs.txt $rnnlm_dir/unigram_probs.txt
rnnlm/change_vocab.sh build/fst/data/prunedlm/words.txt \
$rnnlm_dir build/fst/data/rnnlm_unk

fi


if [ $stage -le 8 ]; then
echo "Converting compounder LM to FST"
rm -rf build/fst/data/compounderlm
mkdir -p build/fst/data/compounderlm
cat $lexicon | perl -npe 's/(\(\d\))?\s.+//' | uniq | ./bin/make-compounder-symbols.py > build/fst/data/compounderlm/words.txt
zcat $compounder_lm | \
grep -v '<s> <s>' | \
grep -v '</s> <s>' | \
grep -v '</s> </s>' | \
arpa2fst --disambig-symbol='#0' --read-symbol-table=build/fst/data/compounderlm/words.txt - | fstproject --project_output=true | fstarcsort --sort_type=ilabel > build/fst/data/compounderlm/G.fst
fi
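
A sketch of running this script by hand, mirroring how the Dockerfile above invokes it: KALDI_ROOT must point at a compiled Kaldi tree, and --stage can be used to resume from a later step.

    cd /opt/est-asr-pipeline
    KALDI_ROOT=/opt/kaldi ./bin/compile_models.sh --stage 4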