Persephone refactor #144

Merged · 10 commits · May 15, 2024
8 changes: 6 additions & 2 deletions README.md
@@ -30,10 +30,14 @@ The full API is described in the documentation page [https://hyperion-ml.readthe
```
conda create --name ${your_env} python=3.11
conda activate ${your_env}
conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
# We used PyTorch 2.0.1, other versions may work too
conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pytorch-cuda=11.8 -c pytorch -c nvidia
# If using k2 for ASR
wget https://huggingface.co/csukuangfj/k2/resolve/main/ubuntu-cuda/k2-1.24.4.dev20240223+cuda11.8.torch2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
pip install k2-1.24.4.dev20240223+cuda11.8.torch2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
```
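
To confirm that the k2 wheel matches the installed PyTorch/CUDA build, a quick sanity check (assuming a recent k2 release, which exposes a version module) is:
```
# Prints the CUDA and PyTorch versions that k2 was built against
python3 -m k2.version
```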

For systems with cuda 10.2 driver:
For older systems with cuda 10.2 driver:
```
conda create --name ${your_env} python=3.10
conda activate ${your_env}
3 changes: 3 additions & 0 deletions egs/librispeech/v1/conf/fbank80_specaug1_mn_16k.yaml
@@ -23,3 +23,6 @@ spec_augment:
mask_method: mean
mvn:
norm_var: false
left_context: 0
right_context: 0

9 changes: 9 additions & 0 deletions egs/librispeech/v1/conf/sp_unigram_1000.yaml
@@ -0,0 +1,9 @@
vocab_size: 1000
model_type: unigram
char_coverage: 1.0
unk_id: 2
user_defined_symbols:
- <blk>
- <sos/eos>
uppercase_text: true

9 changes: 9 additions & 0 deletions egs/librispeech/v1/conf/sp_unigram_512.yaml
@@ -0,0 +1,9 @@
vocab_size: 512
model_type: unigram
char_coverage: 1.0
unk_id: 2
user_defined_symbols:
- <blk>
- <sos/eos>
uppercase_text: true

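These fields mirror standard SentencePiece unigram-training options. As a rough sketch of what the 512-piece config corresponds to when calling SentencePiece directly (the exact flags passed by hyperion-train-tokenizer are not shown in this diff, and the transcripts file name is a placeholder):
```
# Hypothetical direct spm_train call mirroring conf/sp_unigram_512.yaml.
# hyperion-train-tokenizer presumably also uppercases the transcripts first
# (uppercase_text: true); "transcripts.txt" stands in for the training text.
spm_train \
  --input=transcripts.txt \
  --model_prefix=tokenizer \
  --model_type=unigram \
  --vocab_size=512 \
  --character_coverage=1.0 \
  --unk_id=2 \
  --user_defined_symbols="<blk>,<sos/eos>"
```
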
@@ -4,23 +4,31 @@ data:
wav_scale: 1
aug_cfgs:
- conf/speed_reverb_noise10-20dB_aug.yaml
tokenizer_mappings:
- text->text
tokenizer_files:
- data/token_librispeech_train-960_unigram_512/tokenizer.yaml
return_segment_info:
- text
sampler:
sampler_type: bucketing_seg_sampler
max_batch_length: 625.
max_batch_length: 1500.
min_batch_size: 1
drop_last: false
data_loader:
num_workers: 8
val:
dataset:
wav_scale: 1
tokenizer_mappings:
- text->text
tokenizer_files:
- data/token_librispeech_train-960_unigram_512/tokenizer.yaml
return_segment_info:
- text
sampler:
sampler_type: bucketing_seg_sampler
max_batch_length: 625
max_batch_length: 1500.
min_batch_size: 1
drop_last: true
data_loader:
@@ -36,7 +44,7 @@ model:
num_blocks: 16
d_ff: 576
in_layer_type: conv2d-sub
decoder:
rnnt_decoder:
rnnt_loss: k2_pruned
simple_loss_scale: 0.2
predictor:
@@ -62,7 +70,8 @@ trainer:
min_lr: 1e-6
warmup_steps: 25000
update_lr_on_opt_step: true
grad_clip: 100
# grad_clip: 100
grad_clip: 20
use_amp: true
log_interval: 1000
epochs: 120
@@ -1,17 +1,19 @@
# Conformer + RNN-T

# training data
nnet_train_data=train_960h
nnet_val__data=dev_all
nnet_train_data=librispeech_train-960
nnet_val_data=librispeech_dev

# tokenizer
bpe_model=data/lang_bpe_1000/bpe.model
token_train_data=librispeech_train-960
token_cfg=conf/sp_unigram_512.yaml
token_dir=data/token_${token_train_data}_unigram_512
token_model=$token_dir/tokenizer.model

# rnn-t cfg
nnet_type=conformer_v1_rnn_transducer
nnet_name=fbank80_mn_conf16x144_rnnt_k2_pruned.v1.0p
nnet_s1_base_cfg=conf/train_${nnet_name}.s1.yaml
nnet_s1_args=""
nnet_s1_cfg=conf/train_${nnet_name}.s1.yaml
nnet_s1_name=$nnet_name.s1

nnet_s1_dir=exp/asr_nnets/$nnet_s1_name
25 changes: 25 additions & 0 deletions egs/librispeech/v1/run_001_prepare_data.sh
@@ -41,3 +41,28 @@ if [ $stage -le 1 ]; then
touch data/lhotse_librispeech/.librispeech.done
fi
fi

if [ $stage -le 2 ];then
echo "Stage 2: Convert Manifest to Hyperion Datasets"
for data in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other
do
hyperion-dataset from_lhotse \
--recordings-file data/lhotse_librispeech/librispeech_recordings_${data}.jsonl.gz \
--supervisions-file data/lhotse_librispeech/librispeech_supervisions_${data}.jsonl.gz \
--dataset data/librispeech_${data}
done

fi

if [ $stage -le 3 ];then
echo "Stage 3: Merge Librispeech train sets"
hyperion-dataset merge \
--input-datasets data/librispeech_train-{clean-100,clean-360,other-500} \
--dataset data/librispeech_train-960

echo "Stage 3: Merge Librispeech dev sets"
hyperion-dataset merge \
--input-datasets data/librispeech_dev-{clean,other} \
--dataset data/librispeech_dev

fi
102 changes: 102 additions & 0 deletions egs/librispeech/v1/run_002_prepare_noises_rirs.sh
@@ -0,0 +1,102 @@
#!/bin/bash
# Copyright
# 2020 Johns Hopkins University (Author: Jesus Villalba)
# Apache 2.0.
#
. ./cmd.sh
. ./path.sh
set -e

stage=1
nj=10
config_file=default_config.sh
. parse_options.sh || exit 1;
. $config_file
. datapath.sh

# We prepare the noise files and RIRs for online speech augmentation
if [ $stage -le 1 ]; then
for name in noise music speech
do
hyperion-prepare-data musan \
--corpus-dir $musan_root \
--subset $name \
--output-dir data/musan_$name
done
fi

if [ $stage -le 2 ]; then
# # Prepare to distribute data over multiple machines
# # This only does something at CLSP grid
# hyp_utils/create_data_split_dirs.sh $vad_dir $USER/hyp-data/voxceleb/v1.2/vad $nodes

for name in musan_noise musan_music
do
input_data_dir=data/$name
output_data_dir=data/${name}_proc_audio
output_dir=exp/proc_audio/$name
$train_cmd JOB=1:$nj $output_dir/log/preproc_audios_${name}.JOB.log \
hyp_utils/conda_env.sh \
hyperion-preprocess-audio-files \
--audio-format flac \
--part-idx JOB --num-parts $nj \
--recordings-file $input_data_dir/recordings.csv \
--output-path $output_dir \
--output-recordings-file $output_dir/recordings.JOB.csv

hyperion-tables cat \
--table-type recordings \
--output-file $output_dir/recordings.csv --num-tables $nj
hyperion-dataset set_recordings \
--dataset $input_data_dir \
--recordings-file $output_dir/recordings.csv \
--output-dataset $output_data_dir


done
fi

if [ $stage -le 3 ]; then
# Create Babble noise from MUSAN speech files
for name in musan_speech
do
input_data_dir=data/$name
output_data_dir=data/${name}_babble
output_dir=exp/proc_audio/${name}_babble
$train_cmd $output_dir/log/make_babble_noise_${name}.log \
hyp_utils/conda_env.sh \
hyperion-make-babble-noise-audio-files \
--audio-format flac \
--min-spks 3 --max-spks 10 --num-reuses 5 \
--recordings-file $input_data_dir/recordings.csv \
--output-path $output_dir \
--output-recordings-file $output_data_dir/recordings.csv
hyperion-dataset make_from_recordings \
--dataset $output_data_dir \
--recordings-file $output_data_dir/recordings.csv
done
fi

if [ $stage -le 4 ]; then
if [ ! -d "RIRS_NOISES" ]; then
# Download the package that includes the real RIRs, simulated RIRs, isotropic noises and point-source noises
wget --no-check-certificate http://www.openslr.org/resources/28/rirs_noises.zip
unzip rirs_noises.zip
fi
hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/smallroom --output-dir data/rirs_smallroom
hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/simulated_rirs/mediumroom --output-dir data/rirs_mediumroom
hyperion-prepare-data rirs --corpus-dir RIRS_NOISES/real_rirs_isotropic_noises --output-dir data/rirs_real
for rirs in rirs_smallroom rirs_mediumroom rirs_real
do
output_dir=exp/rirs/$rirs
data_dir=data/$rirs
$train_cmd $output_dir/log/pack_rirs_${name}.log \
hyp_utils/conda_env.sh \
hyperion-pack-wav-rirs ${args} --input $data_dir/recordings.csv \
--output h5,csv:$output_dir/rirs.h5,$output_dir/rirs.csv || exit 1;
hyperion-dataset add_features --dataset $data_dir \
--features-name rirs --features-file $output_dir/rirs.csv

done
fi

25 changes: 25 additions & 0 deletions egs/librispeech/v1/run_003_train_tokenizers.sh
@@ -0,0 +1,25 @@
#!/bin/bash
# Copyright
# 2020 Johns Hopkins University (Author: Jesus Villalba)
# Apache 2.0.
#
. ./cmd.sh
. ./path.sh
set -e

stage=1
nj=10
config_file=default_config.sh
. parse_options.sh || exit 1;
. $config_file
. datapath.sh

if [ $stage -le 1 ];then
$train_cmd \
$token_dir/train_sp.log \
hyperion-train-tokenizer sentencepiece \
--cfg $token_cfg \
--segments-file data/$token_train_data/segments.csv \
--tokenizer-path $token_dir

fi
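
Once the tokenizer is trained, the resulting model (written under $token_dir as tokenizer.model, per default_config.sh) can be smoke-tested with the SentencePiece CLI, assuming spm_encode is on the PATH and the training wrapper writes the model to that path; the uppercase input matches uppercase_text: true in the config:
```
# Encode a sample transcript with the trained unigram model and print the pieces
echo "THE QUICK BROWN FOX" | \
  spm_encode --model=data/token_librispeech_train-960_unigram_512/tokenizer.model \
             --output_format=piece
```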
46 changes: 46 additions & 0 deletions egs/librispeech/v1/run_004_train_asr.sh
@@ -0,0 +1,46 @@
#!/bin/bash
# Copyright
# 2022 Johns Hopkins University (Author: Yen-Ju Lu)
# Apache 2.0.
#
. ./cmd.sh
. ./path.sh
set -e

stage=1
ngpu=2
config_file=default_config.sh
interactive=false
num_workers=""
use_tb=false
use_wandb=false

. parse_options.sh || exit 1;
. $config_file
. datapath.sh

train_dir=data/${nnet_train_data}
val_dir=data/${nnet_val_data}

if [ "$interactive" == "true" ];then
export cuda_cmd=run.pl
fi

# Network Training
if [ $stage -le 1 ]; then

mkdir -p $nnet_s1_dir/log
$cuda_cmd \
--gpu $ngpu $nnet_s1_dir/log/train.log \
hyp_utils/conda_env.sh --conda-env $HYP_ENV --num-gpus $ngpu \
hyperion-train-wav2rnn-transducer $nnet_type \
--cfg $nnet_s1_cfg \
--data.train.dataset.recordings-file $train_dir/recordings.csv \
--data.train.dataset.segments-file $train_dir/segments.csv \
--data.val.dataset.recordings-file $val_dir/recordings.csv \
--data.val.dataset.segments-file $val_dir/segments.csv \
--trainer.exp-path $nnet_s1_dir $args \
--num-gpus $ngpu
#--data.train.dataset.bpe-model $token_model \
fi

3 changes: 2 additions & 1 deletion egs/voxceleb/ssl.v1/conf/train_fwseresnet34_dino_v1.2.2.yaml
@@ -87,7 +87,8 @@ trainer:
init_momentum: 0.996
momentum: 1.0
warmup_steps: 500000
grad_clip: 15
# grad_clip: 15
grad_clip: 5
use_amp: true
log_interval: 1000
epochs: 100
@@ -18,7 +18,20 @@ nnet_s1_base_cfg=conf/train_fwseresnet34_dino_v1.2.2.yaml
nnet_s1_name=$nnet_name.s1
nnet_s1_dir=exp/xvector_nnets/$nnet_s1_name
nnet_s1=$nnet_s1_dir/teacher_model_ep0034.pth
nnet_s1=$nnet_s1_dir/teacher_model_ep0025.pth
nnet_s1=$nnet_s1_dir/teacher_model_ep0038.pth
nnet_s1=$nnet_s1_dir/teacher_model_ep0043.pth
nnet_s1=$nnet_s1_dir/teacher_model_ep0044.pth
nnet_s1=$nnet_s1_dir/teacher_model_ep0046.pth
nnet_s1=$nnet_s1_dir/teacher_model_ep0049.pth
nnet_s1=$nnet_s1_dir/teacher_model_ep0054.pth
nnet_s1=$nnet_s1_dir/teacher_model_ep0058.pth
nnet_s1=$nnet_s1_dir/teacher_model_ep0064.pth
nnet_s1=$nnet_s1_dir/teacher_model_ep0067.pth
nnet_s1=$nnet_s1_dir/teacher_model_ep0071.pth
nnet_s1=$nnet_s1_dir/teacher_model_ep0077.pth
nnet_s1=$nnet_s1_dir/teacher_model_ep0083.pth
nnet_s1=$nnet_s1_dir/teacher_model_ep0088.pth
nnet_s1=$nnet_s1_dir/teacher_model_ep0094.pth

# clustering of dino embeddings
cluster_method=cos_ahc_plda_ahc