Speech Recognition -- Wav2vec2 with Transducer Decoder #124

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
28 changes: 28 additions & 0 deletions egs/librispeech/v1/cmd.sh
@@ -0,0 +1,28 @@
# you can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub). slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

if [ "$(hostname -d)" == "cm.gemini" ];then
export train_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G"
export cuda_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 20G"
#export cuda_cmd="queue.pl --config conf/coe_gpu_v100.conf --mem 20G"
export cuda_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 40G"
export cuda_eval_cmd="queue.pl --config conf/coe_gpu_short.conf --mem 4G"
#export cuda_eval_cmd="queue.pl --config conf/coe_gpu_rtx.conf --mem 10G"
#export cuda_eval_cmd="queue.pl --config conf/coe_gpu_long.conf --mem 4G"
else
export train_cmd="queue.pl --mem 4G -l hostname=\"[bc][01]*\" -V"
export cuda_cmd="queue.pl --mem 20G -l hostname=\"c[01]*\" -V"
export cuda_eval_cmd="$train_cmd"
fi
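As the comments above suggest, the grid wrappers can be bypassed for a purely local run. A minimal sketch, assuming you accept the memory caveat and have run.pl available through the recipe's linked utility scripts:

# run everything locally, without a queueing system
export train_cmd="run.pl"
export cuda_cmd="run.pl"
export cuda_eval_cmd="run.pl"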



11 changes: 11 additions & 0 deletions egs/librispeech/v1/conf/clsp.conf
@@ -0,0 +1,11 @@

# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64* -V
option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0 -l 'hostname=b[1]*|c0[123456789]*|c1[134679]*|c2[1357]*'
option gpu=* -l 'hostname=c0[123456789]*|c1[1345679]*|c2[12357]*,gpu=$0'
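This file follows the standard Kaldi queue.conf format: generic options (mem, num_threads, gpu, max_jobs_run) are mapped to CLSP-grid qsub flags. A hedged usage sketch, where my_script.sh is a placeholder command:

# 10-way array job, 4G of RAM and one GPU per job, using this config
queue.pl --config conf/clsp.conf --mem 4G --gpu 1 JOB=1:10 exp/log/train.JOB.log my_script.sh JOB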
35 changes: 35 additions & 0 deletions egs/librispeech/v1/conf/reverb_noise_aug.yaml
@@ -0,0 +1,35 @@
reverb_aug:
  reverb_prob: 0.45
  max_reverb_context: 0.5
  rir_types:
    smallroom:
      weight: 1
      rir_path: scp:data/rirs_smallroom/rirs.scp
      rir_norm: max
    mediumroom:
      weight: 1
      rir_path: scp:data/rirs_mediumroom/rirs.scp
      rir_norm: max
    realroom:
      weight: 1
      rir_path: scp:data/rirs_real/rirs.scp
      rir_norm: max
noise_aug:
  noise_prob: 0.7
  noise_types:
    noise:
      weight: 1
      noise_path: data/musan_noise_proc_audio/wav.scp
      min_snr: 0
      max_snr: 18
    music:
      weight: 1
      noise_path: data/musan_music_proc_audio/wav.scp
      min_snr: 3
      max_snr: 18
    babble:
      weight: 1
      noise_path: data/musan_speech_babble/wav.scp
      min_snr: 3
      max_snr: 18
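This config applies reverberation with probability 0.45 and additive MUSAN noise with probability 0.7, with SNRs drawn from the per-type min/max ranges. The RIR and noise lists it references must be prepared beforehand; a quick hedged sanity check over the paths listed above:

for f in data/rirs_smallroom/rirs.scp data/rirs_mediumroom/rirs.scp data/rirs_real/rirs.scp \
  data/musan_noise_proc_audio/wav.scp data/musan_music_proc_audio/wav.scp data/musan_speech_babble/wav.scp; do
  [ -f "$f" ] || echo "missing augmentation list: $f"
done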

56 changes: 56 additions & 0 deletions egs/librispeech/v1/conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml
@@ -0,0 +1,56 @@
data:
  train:
    dataset:
      wav_scale: 1
      aug_cfgs:
      - conf/reverb_noise_aug.yaml
      return_segment_info:
      - text
    sampler:
      sampler_type: 'seg_sampler'
      # sampler_type: 'bucketing_seg_sampler'
      min_batch_size: 4
      batch_size: 4
      iters_per_epoch: 6
      drop_last: true
    data_loader:
      num_workers: 8
  val:
    dataset:
      aug_cfgs:
      - conf/reverb_noise_aug.yaml
      wav_scale: 1
      return_segment_info:
      - text
    sampler:
      sampler_type: 'seg_sampler'
      # sampler_type: 'bucketing_seg_sampler'
      min_batch_size: 2
      batch_size: 2
      iters_per_epoch: 6
      drop_last: true
    data_loader:
      num_workers: 8
model: wav2vec2xlsr300m_transducer.yaml
trainer:
  optim:
    opt_type: sgd
    lr: 0.003
    momentum: 0.9
    weight_decay: 4e-4
  lrsched:
    lrsch_type: exp_lr
    decay_rate: 0.5
    decay_steps: 4200
    hold_steps: 1500
    min_lr: 4e-5
    warmup_steps: 1500
    update_lr_on_opt_step: true
  grad_clip: 100
  use_amp: true
  log_interval: 1000
  epochs: 60
  eff_batch_size: 1024
  train_mode: hf-feats-frozen-nograd
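Note the gap between the per-sampler batch_size of 4 and eff_batch_size of 1024: the trainer presumably bridges it with gradient accumulation (and data parallelism). A rough arithmetic sketch, assuming 4 GPUs purely for illustration:

# accumulation steps = eff_batch_size / (batch_size * num_gpus)
batch_size=4; num_gpus=4; eff_batch_size=1024
echo $(( eff_batch_size / (batch_size * num_gpus) ))   # prints 64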


11 changes: 11 additions & 0 deletions egs/librispeech/v1/conf/wav2vec2xlsr300m_transducer.yaml
@@ -0,0 +1,11 @@
hf_feats:
  pretrained_model_path: facebook/wav2vec2-base-960h #microsoft/wavlm-base #facebook/wav2vec2-base #microsoft/wavlm-base-plus
transducer:
  decoder:
    embedding_dim: 1024
    num_layers: 2
    hidden_dim: 512
  joiner:
    num_layers: 1
feat_fusion_method: weighted-avg
feat_fusion_start: 2
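Note that the file is named for XLSR 300M but pretrained_model_path currently points to facebook/wav2vec2-base-960h, with WavLM variants left as commented alternatives. A hedged way to verify the Hugging Face checkpoint is reachable (and to pre-populate the local cache) before launching training:

python -c "from transformers import Wav2Vec2Model; Wav2Vec2Model.from_pretrained('facebook/wav2vec2-base-960h')"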
22 changes: 22 additions & 0 deletions egs/librispeech/v1/datapath.sh
@@ -0,0 +1,22 @@
# Copyright
# 2018 Johns Hopkins University (Author: Jesus Villalba)
#
# Paths to the databases used in the experiment


if [ "$(hostname --domain)" == "clsp.jhu.edu" ];then
librispeech_root=/export/corpora5/LibriSpeech
musan_root=/export/corpora5/JHU/musan
elif [ "$(hostname --domain)" == "cm.gemini" ];then
# voxceleb1_root=/expscratch/dsnyder/VoxCeleb1 #voxceleb1 v1
# voxceleb1_root=/exp/jvillalba/corpora/voxceleb1 #voxceleb1 v2
# voxceleb2_root=/expscratch/dgromero/corpora-open/vox2
# musan_root=/expscratch/dgromero/corpora-open/musan
echo "Put your database paths here"
exit 1
else
echo "Put your database paths here"
exit 1
fi
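To run the recipe at another site, add a branch with that site's corpus locations; the domain and paths below are hypothetical placeholders:

elif [ "$(hostname --domain)" == "your.domain.edu" ];then
  librispeech_root=/path/to/LibriSpeech  # hypothetical path
  musan_root=/path/to/musan              # hypothetical path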


1 change: 1 addition & 0 deletions egs/librispeech/v1/default_config.sh
1 change: 1 addition & 0 deletions egs/librispeech/v1/feats
39 changes: 39 additions & 0 deletions egs/librispeech/v1/global_conf/config_transducer_v1.sh
@@ -0,0 +1,39 @@
# Wav2vec2 XLSR 300M + Transducer decoder

# hugging face model
hf_model_name=wav2vec2xlsr300m

#vad
# vad_config=conf/vad_16k.yaml

# transducer training data
nnet_data=train_clean_100
dev_data=dev_clean
# nnet_data=train_clean_small

bpe_model=data/lang_bpe_1000/bpe.model
# transducer cfg

nnet_type=hf_wav2vec2transducer

nnet_s1_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml
nnet_s1_args=""

nnet_name=${hf_model_name}_transducer_v1.0
nnet_s1_name=$nnet_name.s1

nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name
nnet_s1=$nnet_s1_dir/model_ep0060.pth

nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml
nnet_s2_args=""
nnet_s2_name=${nnet_name}.s2
nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name
nnet_s2=$nnet_s2_dir/model_ep0020.pth

nnet_s3_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml
nnet_s3_args=""
nnet_s3_name=${nnet_name}.s3
nnet_s3_dir=exp/transducer_nnets/$nnet_s3_name
nnet_s3=$nnet_s3_dir/model_ep0002.pth
nnet_s3=$nnet_s3_dir/model_ep0005.pth
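Since this is a plain shell fragment (presumably what default_config.sh points to), it can be sourced to inspect the resolved experiment paths; a minimal sketch from egs/librispeech/v1:

. global_conf/config_transducer_v1.sh
echo "stage-1 model dir:  $nnet_s1_dir"
echo "stage-1 checkpoint: $nnet_s1"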
1 change: 1 addition & 0 deletions egs/librispeech/v1/hyp_utils
88 changes: 88 additions & 0 deletions egs/librispeech/v1/local/data_prep.sh
@@ -0,0 +1,88 @@
#!/usr/bin/env bash

# Copyright 2014 Vassil Panayotov
# 2014 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

if [ "$#" -ne 2 ]; then
echo "Usage: $0 <src-dir> <dst-dir>"
echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean"
exit 1
fi

src=$1
dst=$2

# all utterances are FLAC compressed
if ! which flac >&/dev/null; then
  echo "Please install 'flac' on ALL worker nodes!"
  exit 1
fi

spk_file=$src/../SPEAKERS.TXT

mkdir -p $dst || exit 1

[ ! -d $src ] && echo "$0: no such directory $src" && exit 1
[ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1


wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp
trans=$dst/text; [[ -f "$trans" ]] && rm $trans
utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk
spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender

for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do
  reader=$(basename $reader_dir)
  if ! [ $reader -eq $reader ]; then  # not integer.
    echo "$0: unexpected subdirectory name $reader"
    exit 1
  fi

  reader_gender=$(egrep "^$reader[ ]+\|" $spk_file | awk -F'|' '{gsub(/[ ]+/, ""); print tolower($2)}')
  if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then
    echo "Unexpected gender: '$reader_gender'"
    exit 1
  fi

  for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do
    chapter=$(basename $chapter_dir)
    if ! [ "$chapter" -eq "$chapter" ]; then
      echo "$0: unexpected chapter-subdirectory name $chapter"
      exit 1
    fi

    find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \
      awk -v "dir=$chapter_dir" '{printf "%s flac -c -d -s %s/%s.flac |\n", $0, dir, $0}' >>$wav_scp || exit 1

    chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt
    [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1
    cat $chapter_trans >>$trans

    # NOTE: For now we are using per-chapter utt2spk. That is, each chapter is considered
    # to be a different speaker. This is done for simplicity and because we want
    # e.g. the CMVN to be calculated per-chapter
    awk -v "reader=$reader" -v "chapter=$chapter" '{printf "%s %s-%s\n", $1, reader, chapter}' \
      <$chapter_trans >>$utt2spk || exit 1

    # reader -> gender map (again using per-chapter granularity)
    echo "${reader}-${chapter} $reader_gender" >>$spk2gender
  done
done

spk2utt=$dst/spk2utt
utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt || exit 1
utils/data/get_utt2dur.sh $dst
awk '{sub(/ *$/, "", $0); print}' $dst/utt2dur > $dst/utt2dur2
mv $dst/utt2dur2 $dst/utt2dur

ntrans=$(wc -l <$trans)
nutt2spk=$(wc -l <$utt2spk)
! [ "$ntrans" -eq "$nutt2spk" ] && \
echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1

utils/validate_data_dir.sh --no-feats $dst || exit 1

echo "$0: successfully prepared data in $dst"

exit 0
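With datapath.sh sourced, the script would be run once per LibriSpeech split, e.g. (output directory names assumed to match the train_clean_100/dev_clean sets referenced in the global config):

local/data_prep.sh $librispeech_root/train-clean-100 data/train_clean_100
local/data_prep.sh $librispeech_root/dev-clean data/dev_clean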