Showing 37 changed files with 1,922 additions and 436 deletions.
@@ -1,11 +1,99 @@
FROM alumae/kaldi-offline-transcriber-et
FROM kaldiasr/kaldi:latest
MAINTAINER Tanel Alumae <alumae@gmail.com>

LABEL maintainer="Aivo Olev"
RUN apt-get update && apt-get install -y \
    autoconf \
    automake \
    bzip2 \
    g++ \
    gfortran \
    git \
    libatlas3-base \
    libtool-bin \
    make \
    python2.7 \
    python-pip \
    python-dev \
    sox \
    ffmpeg \
    subversion \
    wget \
    zlib1g-dev && \
    apt-get clean autoclean && \
    apt-get autoremove -y

COPY scripts/diarization.sh /opt/kaldi-offline-transcriber/scripts/diarization.sh
COPY .gitignore /opt/kaldi-offline-transcriber/.gitignore
COPY transcribe.nf /opt/kaldi-offline-transcriber/

RUN apt-get update && apt-get install -y procps
ENV PATH="/root/miniconda3/bin:${PATH}"
ARG PATH="/root/miniconda3/bin:${PATH}"

CMD ["/bin/bash"]
RUN wget \
    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
    && mkdir /root/.conda \
    && bash Miniconda3-latest-Linux-x86_64.sh -b \
    && rm -f Miniconda3-latest-Linux-x86_64.sh

RUN conda --version

RUN conda install -c conda-forge pynini=2.1.3

RUN conda install pytorch=1.8.1 torchvision torchaudio=0.8.1 cpuonly -c pytorch

RUN conda install ruamel.yaml && \
    pip install kaldiio && \
    pip install simplejson && \
    pip install pytest

RUN pip install speechbrain

WORKDIR /opt

RUN git clone https://github.com/alumae/et-g2p-fst.git

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y locales

RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen && \
    dpkg-reconfigure --frontend=noninteractive locales && \
    update-locale LANG=en_US.UTF-8

ENV LANG en_US.UTF-8

RUN apt-get install -y openjdk-8-jre-headless

RUN cd /opt/kaldi/tools && \
    extras/install_pocolm.sh

ENV HOME /opt
ENV LD_LIBRARY_PATH /usr/local/lib

RUN ln -s -f /usr/bin/python2 /usr/bin/python && \
    apt-get install -y python-numpy python-scipy python3-simplejson python3-pytest && \
    pip2 install theano --no-deps

# Set up punctuator
RUN mkdir -p /opt/est-asr-pipeline && \
    cd /opt/est-asr-pipeline && \
    wget -q -O - http://bark.phon.ioc.ee/tanel/est_punct2.tar.gz | tar xvz

RUN cd /opt/est-asr-pipeline && \
    wget -q -O - http://bark.phon.ioc.ee/tanel/kaldi-offline-transcriber-data-2021-06-11.tgz | tar xvz

COPY bin /opt/est-asr-pipeline/bin

ENV KALDI_ROOT /opt/kaldi

RUN cd /opt/est-asr-pipeline && \
    touch -m path.sh && \
    ./bin/compile_models.sh

# This can be removed once the base data pack has been fixed
RUN echo '--sample-frequency=16000' > /opt/est-asr-pipeline/kaldi-data/sid/mfcc_sid.conf && \
    echo '--frame-length=25' >> /opt/est-asr-pipeline/kaldi-data/sid/mfcc_sid.conf && \
    echo '--low-freq=20' >> /opt/est-asr-pipeline/kaldi-data/sid/mfcc_sid.conf && \
    echo '--high-freq=7600' >> /opt/est-asr-pipeline/kaldi-data/sid/mfcc_sid.conf && \
    echo '--num-mel-bins=30' >> /opt/est-asr-pipeline/kaldi-data/sid/mfcc_sid.conf && \
    echo '--num-ceps=24' >> /opt/est-asr-pipeline/kaldi-data/sid/mfcc_sid.conf && \
    echo '--snip-edges=false' >> /opt/est-asr-pipeline/kaldi-data/sid/mfcc_sid.conf

CMD ["/bin/bash"]
@@ -0,0 +1,35 @@
#! /usr/bin/env python

import sys
import argparse
from subprocess import Popen, PIPE

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Convert aligned output to CTM')
    parser.add_argument('--frame-shift', default=0.01, type=float)
    parser.add_argument('--unk-word', default="<unk>")
    parser.add_argument('--unk-p2g-cmd', default="")
    args = parser.parse_args()

    unk_p2g_proc = None
    if args.unk_p2g_cmd != "":
        unk_p2g_proc = Popen(args.unk_p2g_cmd, shell=True, stdin=PIPE, stdout=PIPE)

    for l in sys.stdin:
        ss = l.split()
        start_frame = int(ss[1])
        num_frames = int(ss[2])
        word = ss[4]
        phones_str = " ".join(ss[5:])

        if word == args.unk_word and unk_p2g_proc:
            unk_p2g_proc.stdin.write((phones_str + "\n").encode('utf-8'))
            unk_p2g_proc.stdin.flush()
            word = unk_p2g_proc.stdout.readline().strip().decode('utf-8')
            #word = "[%s]" % word

        if word != "<eps>":
            print("%s 1 %0.2f %0.2f %s" % (ss[0], start_frame * args.frame_shift, num_frames * args.frame_shift, word))
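Each stdin line is expected to carry an utterance id, a start frame, a frame count, a field the script ignores, the word, and its phones; the output is one CTM row per non-<eps> word, with times derived from --frame-shift. A minimal sketch with a made-up input line (the script name to_ctm.py is illustrative, the commit does not show the file path):

# "tere" starts at frame 250 and spans 32 frames at the default 0.01 s frame shift.
echo "utt1 250 32 1.0 tere t e r e" | python to_ctm.py
# expected output: utt1 1 2.50 0.32 tere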
@@ -0,0 +1,36 @@
#! /usr/bin/env python
import logging
import sys
import argparse
import kaldiio
import torch
import pickle
from speechbrain.pretrained import EncoderClassifier

if __name__ == '__main__':
    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
    parser = argparse.ArgumentParser(description="Apply LID to utterances")
    parser.add_argument("dir")

    args = parser.parse_args()

    lid_svc = pickle.load(open("local/lid_clf.pkl", "rb"))

    language_id = EncoderClassifier.from_hparams(source="TalTechNLP/voxlingua107-epaca-tdnn")

    with kaldiio.ReadHelper(f'scp:{args.dir}/wav.scp', segments=f'{args.dir}/segments') as reader:
        for key, (rate, numpy_array) in reader:
            torch_array = torch.from_numpy(numpy_array)

            prediction = language_id.classify_batch(torch_array)

            emb = language_id.encode_batch(torch_array)

            svc_result = lid_svc.predict([emb.squeeze().numpy()])
            print(key, svc_result, prediction[3])

            #breakpoint()
            #top5 = prediction[0][0].argsort(descending=True)[0:5]
            #lang_codes = [language_id.hparams.label_encoder.ind2lab[i.item()] for i in top5]
            #scores = prediction[0][0][top5]
            #print(key, " ".join([f"{l}:{s:0.2f}" for l, s in zip(lang_codes, scores)]))
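The script loads a pickled classifier from local/lid_clf.pkl and the VoxLingua107 ECAPA-TDNN model, then prints, for every segment in a Kaldi data directory, the utterance key, the pickled classifier's decision on the embedding, and the label from classify_batch (its fourth return value appears to be the decoded language label). A minimal sketch of an invocation, assuming the file is saved as apply_lid.py (the name is illustrative) and run from a directory that contains local/lid_clf.pkl:

# data/dev must contain Kaldi-style wav.scp and segments files.
python apply_lid.py data/dev > lid_results.txt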
@@ -0,0 +1,118 @@
#!/bin/bash

# Begin configuration section.
stage=1

echo "$0 $@" # Print the command line for logging

if [ -z "$KALDI_ROOT" ]; then
  echo "Please set KALDI_ROOT"
  exit 1;
fi

rm -f steps utils sid rnnlm
ln -s $KALDI_ROOT/egs/wsj/s5/steps
ln -s $KALDI_ROOT/egs/wsj/s5/utils
ln -s $KALDI_ROOT/egs/sre08/v1/sid
ln -s $KALDI_ROOT/scripts/rnnlm

am=cnn_tdnn_1d_online
lexicon=language_model/lexicon.txt
pruned_lm=language_model/interpolated.pruned9.4g.arpa.gz
rnnlm_dir=language_model/rnnlm
compounder_lm=language_model/compounder.pruned9.4g.arpa.gz

. ./utils/parse_options.sh

export PATH=${KALDI_ROOT}/tools/openfst/bin:utils:${KALDI_ROOT}/src/bin:${KALDI_ROOT}/tools/openfst/bin:${KALDI_ROOT}/src/fstbin/:${KALDI_ROOT}/src/lmbin/:${KALDI_ROOT}/src/rnnlmbin/:$PATH

set -e
set -u
set -o pipefail

if [ $stage -le 1 ]; then
  rm -rf build/fst/${am}
  mkdir -p build/fst/${am}
  cp -r kaldi-data/${am}/* build/fst/${am}
  perl -i -npe 's#=.*online/#=build/fst/${am}/#' build/fst/${am}/conf/*.conf
  if [ ! -e build/fst/${am}/cmvn_opts ]; then \
    echo "--norm-means=false --norm-vars=false" > build/fst/${am}/cmvn_opts; \
  fi
fi

if [ $stage -le 2 ]; then
  # Make a lexicon from the user-provided lexicon
  rm -rf build/fst/data/dict
  mkdir -p build/fst/data/dict
  cp -r kaldi-data/dict/* build/fst/data/dict
  rm -f build/fst/data/dict/lexicon.txt build/fst/data/dict/lexiconp.txt
  cat kaldi-data/dict/lexicon.txt | egrep "^<" > build/fst/data/dict/lexicon.txt
  cat $lexicon | perl -npe 's/\(\d\)(\s)/\1/' >> build/fst/data/dict/lexicon.txt
fi

if [ $stage -le 3 ]; then
  echo "Constructing UNK word LM"
  rm -rf build/fst/data/unk_lang_model
  utils/lang/make_unk_lm.sh --cmd utils/run.pl build/fst/data/dict build/fst/data/unk_lang_model
fi

if [ $stage -le 4 ]; then
  echo "Making build/fst/data/prunedlm/G.fst from the provided ARPA LM"
  rm -rf build/fst/data/prunedlm
  mkdir -p build/fst/data/prunedlm
  utils/prepare_lang.sh --phone-symbol-table build/fst/$am/phones.txt build/fst/data/dict '<unk>' build/fst/data/dict/tmp build/fst/data/prunedlm
  gunzip -c $pruned_lm | arpa2fst --disambig-symbol=#0 \
    --read-symbol-table=build/fst/data/prunedlm/words.txt - build/fst/data/prunedlm/G.fst
  echo "Checking how stochastic G is (the first of these numbers should be small):"
  fstisstochastic build/fst/data/prunedlm/G.fst || echo "not stochastic (probably OK)"
  utils/validate_lang.pl build/fst/data/prunedlm || exit 1
fi

if [ $stage -le 5 ]; then
  echo "Making build/fst/data/prunedlm_unk from build/fst/data/prunedlm"
  rm -rf build/fst/data/prunedlm_unk
  utils/prepare_lang.sh --unk-fst build/fst/data/unk_lang_model/unk_fst.txt build/fst/data/dict "<unk>" build/fst/data/prunedlm build/fst/data/prunedlm_unk
  cp build/fst/data/prunedlm/G.fst build/fst/data/prunedlm_unk
fi

if [ $stage -le 6 ]; then
  echo "Compiling decoding graph"
  rm -rf build/fst/build/fst/${am}/graph_prunedlm_unk
  self_loop_scale_arg=""
  if [ -f build/fst/${am}/frame_subsampling_factor ]; then
    factor=`cat build/fst/${am}/frame_subsampling_factor`
    if [ $factor -eq "3" ]; then
      self_loop_scale_arg="--self-loop-scale 1.0 "
    fi
  fi
  utils/mkgraph.sh $self_loop_scale_arg build/fst/data/prunedlm_unk build/fst/${am} build/fst/${am}/graph_prunedlm_unk
  rm -rf build/fst/data/prunedlm_unk/tmp
fi

if [ $stage -le 7 ]; then
  echo "Preparing RNNLM"
  rm -rf build/fst/data/rnnlm_unk
  cp $rnnlm_dir/config/unigram_probs.txt $rnnlm_dir/unigram_probs.txt
  rnnlm/change_vocab.sh build/fst/data/prunedlm/words.txt \
    $rnnlm_dir build/fst/data/rnnlm_unk
fi

if [ $stage -le 8 ]; then
  echo "Converting compounder LM to FST"
  rm -rf build/fst/data/compounderlm
  mkdir -p build/fst/data/compounderlm
  cat $lexicon | perl -npe 's/(\(\d\))?\s.+//' | uniq | ./bin/make-compounder-symbols.py > build/fst/data/compounderlm/words.txt
  zcat $compounder_lm | \
    grep -v '<s> <s>' | \
    grep -v '</s> <s>' | \
    grep -v '</s> </s>' | \
    arpa2fst --disambig-symbol='#0' --read-symbol-table=build/fst/data/compounderlm/words.txt - | fstproject --project_output=true | fstarcsort --sort_type=ilabel > build/fst/data/compounderlm/G.fst
fi
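Because every variable defined before ". ./utils/parse_options.sh" can be overridden with a matching --flag, a failed build can be resumed at a later stage instead of recompiling everything. A minimal sketch, assuming this file is the bin/compile_models.sh that the Dockerfile runs from /opt/est-asr-pipeline:

cd /opt/est-asr-pipeline
# Skip stages 1-5 and rebuild only the decoding graph, the RNNLM vocabulary and the compounder LM.
KALDI_ROOT=/opt/kaldi ./bin/compile_models.sh --stage 6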