Decode and Fine-tune for Wav2vec2 Transducer ASR #128

Merged

@@ -0,0 +1,61 @@
data:
  train:
    dataset:
      wav_scale: 1
      aug_cfgs:
      - conf/reverb_noise_aug.yaml
      return_segment_info:
      - text
    sampler:
      #sampler_type: 'seg_sampler'
      sampler_type: 'bucketing_seg_sampler'
      max_batch_length: 85.
      min_batch_size: 1
      drop_last: false
    data_loader:
      num_workers: 4
  val:
    dataset:
      aug_cfgs:
      - conf/reverb_noise_aug.yaml
      wav_scale: 1
      return_segment_info:
      - text
    sampler:
      #sampler_type: 'seg_sampler'
      sampler_type: 'bucketing_seg_sampler'
      max_batch_length: 30
      min_batch_size: 1
      drop_last: true
    data_loader:
      num_workers: 4
model:
  transducer:
    decoder:
      override_dropouts: true
      embedding_dropout_rate: 0.3
      rnn_dropout_rate: 0.3

trainer:
  optim:
    opt_type: sgd
    lr: 0.003
    momentum: 0.9
    weight_decay: 4e-4
  lrsched:
    lrsch_type: exp_lr
    decay_rate: 0.5
    decay_steps: 4200
    hold_steps: 1500
    min_lr: 4e-5
    warmup_steps: 1500
    update_lr_on_opt_step: true
  grad_clip: 100
  use_amp: true
  log_interval: 1000
  epochs: 120
  # eff_batch_size: 1024
  eff_batch_size: 128
  train_mode: hf-feats-frozen-nograd
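
The trainer block above combines SGD (lr 0.003, momentum 0.9, weight decay 4e-4) with an exp_lr schedule. As a rough illustration of how these knobs typically interact (linear warmup, a hold phase, then exponential decay with a floor), here is a hypothetical Python sketch; the exact formula lives in the toolkit's scheduler and may differ in detail:

# Hypothetical sketch of the exp_lr schedule implied by the config above;
# assumes warmup precedes the hold phase, which the toolkit may order differently.
def approx_lr(step, base_lr=0.003, warmup_steps=1500, hold_steps=1500,
              decay_rate=0.5, decay_steps=4200, min_lr=4e-5):
    if step < warmup_steps:                          # linear warmup to base_lr
        return base_lr * step / max(1, warmup_steps)
    if step < warmup_steps + hold_steps:             # hold at base_lr
        return base_lr
    decayed = base_lr * decay_rate ** ((step - warmup_steps - hold_steps) / decay_steps)
    return max(decayed, min_lr)                      # floor at min_lr

# approx_lr(750) == 0.0015, approx_lr(2500) == 0.003, approx_lr(20000) is roughly 1.8e-4

Because update_lr_on_opt_step is true, these values would be queried once per optimizer step rather than once per epoch.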


2 changes: 1 addition & 1 deletion egs/librispeech/v1/global_conf/config_transducer_v1.sh
@@ -9,7 +9,7 @@ hf_model_name=wav2vec2xlsr300m
# x-vector training
nnet_data=train_clean_100
dev_data=dev_clean
# nnet_data=train_clean_small
test_data=test_clean

bpe_model=data/lang_bpe_1000/bpe.model
# x-vector cfg
4 changes: 2 additions & 2 deletions egs/librispeech/v1/global_conf/config_transducer_v3.2.sh
@@ -23,9 +23,9 @@ nnet_name=${hf_model_name}_transducer_v3.2
nnet_s1_name=$nnet_name.s1

nnet_s1_dir=exp/transducer_nnets/$nnet_s1_name
nnet_s1=$nnet_s1_dir/model_ep0060.pth
nnet_s1=$nnet_s1_dir/model_ep0120.pth

nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage1_v1.0.yaml
nnet_s2_base_cfg=conf/train_wav2vec2xlsr300m_transducer_stage2_v3.2.yaml
nnet_s2_args=""
nnet_s2_name=${nnet_name}.s2
nnet_s2_dir=exp/transducer_nnets/$nnet_s2_name
50 changes: 27 additions & 23 deletions egs/librispeech/v1/run_011_train_asr.sh
@@ -1,6 +1,6 @@
#!/bin/bash
# Copyright
# 2019 Johns Hopkins University (Author: Jesus Villalba)
# 2022 Johns Hopkins University (Author: Yen-Ju Lu)
# Apache 2.0.
#
. ./cmd.sh
@@ -68,23 +68,25 @@ if [ $stage -le 2 ]; then
if [ "$use_wandb" == "true" ];then
extra_args="$extra_args --trainer.wandb.name $nnet_s2_name.$(date -Iminutes)"
fi

mkdir -p $nnet_s2_dir/log
$cuda_cmd \
--gpu $ngpu $nnet_s2_dir/log/train.log \
hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
finetune_wav2vec2xvector.py $nnet_type \
finetune_wav2vec2transducer.py $nnet_type \
--cfg $nnet_s2_base_cfg $nnet_s2_args $extra_args \
--data.train.dataset.audio-file $list_dir/wav.scp \
--data.train.dataset.time-durs-file $list_dir/utt2dur \
--data.train.dataset.key-file $list_dir/lists_xvec/train.scp \
--data.train.dataset.class-file $list_dir/lists_xvec/class2int \
--data.val.dataset.audio-file $list_dir/wav.scp \
--data.val.dataset.time-durs-file $list_dir/utt2dur \
--data.val.dataset.key-file $list_dir/lists_xvec/val.scp \
--in-model-file $nnet_s1 \
--data.train.dataset.audio-file $train_dir/wav.scp \
--data.train.dataset.segments-file $train_dir/utt2spk \
--data.train.dataset.bpe-model $bpe_model \
--data.train.dataset.text-file $train_dir/text \
--data.val.dataset.audio-file $val_dir/wav.scp \
--data.val.dataset.segments-file $val_dir/utt2spk \
--data.val.dataset.text-file $val_dir/text \
--trainer.exp-path $nnet_s2_dir $args \
--num-gpus $ngpu \
--in-model-file $nnet_s1 \
--data.train.dataset.time-durs-file $train_dir/utt2dur \
--data.val.dataset.time-durs-file $val_dir/utt2dur \
--num-gpus $ngpu

fi

@@ -94,22 +96,24 @@ if [ $stage -le 3 ]; then
extra_args="$extra_args --trainer.wandb.name $nnet_s3_name.$(date -Iminutes)"
fi


mkdir -p $nnet_s3_dir/log
$cuda_cmd \
--gpu $ngpu $nnet_s3_dir/log/train.log \
hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
finetune_wav2vec2xvector.py $nnet_type \
finetune_wav2vec2transducer.py $nnet_type \
--cfg $nnet_s3_base_cfg $nnet_s3_args $extra_args \
--data.train.dataset.audio-file $list_dir/wav.scp \
--data.train.dataset.time-durs-file $list_dir/utt2dur \
--data.train.dataset.key-file $list_dir/lists_xvec/train.scp \
--data.train.dataset.class-file $list_dir/lists_xvec/class2int \
--data.val.dataset.audio-file $list_dir/wav.scp \
--data.val.dataset.time-durs-file $list_dir/utt2dur \
--data.val.dataset.key-file $list_dir/lists_xvec/val.scp \
--in-model-file $nnet_s2 \
--data.train.dataset.audio-file $train_dir/wav.scp \
--data.train.dataset.segments-file $train_dir/utt2spk \
--data.train.dataset.bpe-model $bpe_model \
--data.train.dataset.text-file $train_dir/text \
--data.val.dataset.audio-file $val_dir/wav.scp \
--data.val.dataset.segments-file $val_dir/utt2spk \
--data.val.dataset.text-file $val_dir/text \
--trainer.exp-path $nnet_s3_dir $args \
--num-gpus $ngpu \

--in-model-file $nnet_s2 \
--data.train.dataset.time-durs-file $train_dir/utt2dur \
--data.val.dataset.time-durs-file $val_dir/utt2dur \
--num-gpus $ngpu
fi
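
Both fine-tuning stages point --data.train.dataset.bpe-model at $bpe_model (data/lang_bpe_1000/bpe.model in the global configs) alongside the Kaldi-style text file. Assuming that file is a SentencePiece model, which the lang_bpe_1000 naming suggests but the diff does not state, transcripts would be mapped to integer token targets for the transducer roughly as in this illustrative sketch (not the toolkit's actual data loader):

import sentencepiece as spm

# Illustrative only: encode one Kaldi-style "utt-id transcript" line into BPE ids,
# assuming data/lang_bpe_1000/bpe.model is a SentencePiece model.
sp = spm.SentencePieceProcessor()
sp.load("data/lang_bpe_1000/bpe.model")

line = "utt0001 HELLO WORLD"                         # hypothetical line from $train_dir/text
utt_id, transcript = line.split(maxsplit=1)
token_ids = sp.encode(transcript, out_type=int)      # integer targets for the transducer decoder
print(utt_id, token_ids)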

54 changes: 14 additions & 40 deletions egs/librispeech/v1/run_030_inference.sh
@@ -1,26 +1,23 @@
#!/bin/bash
# Copyright
# 2020 Johns Hopkins University (Author: Jesus Villalba)
# 2022 Johns Hopkins University (Author: Yen-Ju Lu)
# Apache 2.0.
#
. ./cmd.sh
. ./path.sh
set -e

stage=2
config_file=default_config.sh
use_gpu=false
nnet_stage=3
hf_chunk_length=120 #seconds
xvec_chunk_length=120 #seconds
nnet_stage=1
. parse_options.sh || exit 1;
. $config_file

if [ "$use_gpu" == "true" ];then
xvec_args="--use-gpu true --xvec-chunk-length $xvec_chunk_length --hf-chunk-length $hf_chunk_length"
xvec_cmd="$cuda_eval_cmd --mem 6G"
transducer_args="--use-gpu true"
transducer_cmd="$cuda_eval_cmd --mem 6G"
else
xvec_cmd="$train_cmd --mem 12G"
transducer_cmd="$train_cmd --mem 12G"
fi

if [ $nnet_stage -eq 1 ];then
@@ -34,41 +31,18 @@ elif [ $nnet_stage -eq 3 ];then
nnet_name=$nnet_s3_name
fi

xvector_dir=exp/xvectors/$nnet_name
transducer_dir=exp/transducer/$nnet_name

if [ $stage -le 1 ]; then
# Extract xvectors for training LDA/PLDA
for name in voxceleb2cat_train
do
if [ $plda_num_augs -eq 0 ]; then
steps_xvec/extract_wav2vec2xvectors.sh \
--cmd "$xvec_cmd" --nj 100 ${xvec_args} \
--random-utt-length true --min-utt-length 4 --max-utt-length 140 \
$nnet data/${name} \
$xvector_dir/${name}
else
steps_xvec/extract_wav2vec2xvectors.sh \
--cmd "$xvec_cmd" --nj 300 ${xvec_args} \
--random-utt-length true --min-utt-length 4 --max-utt-length 140 \
--aug-config $plda_aug_config --num-augs $plda_num_augs \
$nnet data/${name} \
$xvector_dir/${name}_augx${plda_num_augs} \
data/${name}_augx${plda_num_augs}
fi
done
fi

if [ $stage -le 2 ]; then
# Extracts x-vectors for evaluation
for name in voxceleb1_test
test_data=test_clean


# Decode test sets with the transducer model
for name in $test_data
do
num_spk=$(wc -l data/$name/spk2utt | awk '{ print $1}')
nj=$(($num_spk < 100 ? $num_spk:100))
steps_xvec/extract_wav2vec2xvectors.sh \
--cmd "$xvec_cmd" --nj $nj ${xvec_args} \
nj=16
steps_transducer/decode_wav2vec2transducer.sh --cmd "$transducer_cmd --mem 12G" --nj $nj ${transducer_args} \
$nnet data/$name \
$xvector_dir/$name
$transducer_dir/$name $bpe_model
done
fi

exit
80 changes: 80 additions & 0 deletions egs/librispeech/v1/steps_transducer/decode_wav2vec2transducer.sh
@@ -0,0 +1,80 @@
#!/bin/bash
# 2022 Johns Hopkins University (Author: Yen-Ju Lu)
# Apache 2.0.
nj=30
cmd="run.pl"

use_gpu=false
write_utt2num_frames=true # If true writes utt2num_frames.
stage=0
num_augs=0

echo "$0 $@" # Print the command line for logging

if [ -f path.sh ]; then . ./path.sh; fi
. parse_options.sh || exit 1;

if [ $# != 3 ] && [ $# != 4 ]; then
echo "Usage: $0 [options] <nnet-model> <data> <xvector-dir> [<data-out-dir>]"
echo " e.g.: $0 --feat-config conf/fbank_mvn.yml --aug-config conf/noise_aug.yml exp/xvector_nnet/model.pt data/train exp/xvectors_train [data/train_aug]"
echo "main options (for others, see top of script file)"
echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
echo " --use-gpu <bool|false> # If true, use GPU."
echo " --nj <n|30> # Number of jobs"
echo " --stage <stage|0> # To control partial reruns"
echo " --use-bin-vad <bool|true> # If true, uses binary VAD from vad.scp"
echo " --write-utt2num-frames <bool|tru> # If true, write utt2num_frames file."
echo " --chunk-length <n|0> # If provided, applies encoder with specified chunk-length and "
echo " # concatenates the chunks outputs before pooling"
echo " --feat-config <str> # feature/mvn config file"
echo " --aug-config <str> # augmentation config file"
echo " --random-utt-length # If true, extracts a random chunk from the utterance between "
echo " # min_utt_length and max_utt_length"
echo " --min-utt-length <n|0> # "
echo " --max-utt-length <n|0> # "


fi

nnet_file=$1
data_dir=$2
output_dir=$3
bpe_model=$4

for f in $data_dir/wav.scp ; do
[ ! -f $f ] && echo "No such file $f" && exit 1;
done

log_dir=$output_dir/log
mkdir -p $log_dir

num_gpus=0
args=""
if [ "$use_gpu" == "true" ];then
cmd="$cmd --gpu 1"
num_gpus=1
args="--use-gpu"
fi

if [ "$write_utt2num_frames" == "true" ];then
write_num_frames_opt="--write-num-frames $output_dir/utt2num_frames.JOB"
fi

if [ $stage -le 0 ];then
set +e
$cmd JOB=1:$nj $output_dir/log/decode_transducer.JOB.log \
hyp_utils/conda_env.sh --num-gpus $num_gpus \
decode_wav2transducer.py \
--part-idx JOB --num-parts $nj \
--input $data_dir/wav.scp \
--model-path $nnet_file \
--bpe-model $bpe_model \
--output $output_dir/transducer.JOB.text
set -e
fi

if [ $stage -le 1 ];then
echo "compute wer"
cat $output_dir/transducer.*.text > $output_dir/transducer.text
compute-wer --text --mode=present ark:$data_dir/text ark:$output_dir/transducer.text
fi
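
The last stage concatenates the per-job hypotheses and scores them with Kaldi's compute-wer in --mode=present, i.e. only utterances appearing in both the reference and the hypothesis are scored. If compute-wer is not on the path, a roughly equivalent check can be scripted; the sketch below is an assumption rather than part of this PR and relies on the third-party jiwer package:

import jiwer  # third-party package, not a dependency of this recipe

def read_kaldi_text(path):
    # Kaldi text format: "<utt-id> <word1> <word2> ..."
    utts = {}
    with open(path) as f:
        for line in f:
            parts = line.strip().split(maxsplit=1)
            if len(parts) == 2:                      # skip empty transcripts
                utts[parts[0]] = parts[1]
    return utts

ref = read_kaldi_text("data/test_clean/text")                        # hypothetical paths
hyp = read_kaldi_text("exp/transducer/test_clean/transducer.text")
common = sorted(set(ref) & set(hyp))                 # mimic --mode=present: shared utterances only
wer = jiwer.wer([ref[k] for k in common], [hyp[k] for k in common])
print(f"WER: {100 * wer:.2f}% over {len(common)} utterances")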