Support Whisper-PMFA (#356)

* Support Whisper-PMFA * Support Whisper-PMFA * Support Whisper-PMFA * Support Whisper-PMFA * Support Whisper-PMFA * Support Whisper-PMFA * Support Whisper-PMFA * Support Whisper-PMFA * Support Whisper-PMFA --------- Co-authored-by: Aurora1818 <zhaoyy22@mails.tsinghua.edu.cn>
wenet-e2e · Aug 30, 2024 · d5f6097 · d5f6097
1 parent 03ceb00
commit d5f6097
Show file tree

Hide file tree

Showing 21 changed files with 1,126 additions and 6 deletions.
diff --git a/.gitignore b/.gitignore
@@ -43,3 +43,4 @@ tensorboard
 external_tools
 pretrained_models
 s3prl_hub
+whisper_hub
diff --git a/examples/voxceleb/README.md b/examples/voxceleb/README.md
@@ -1,6 +1,8 @@
 This is a **WeSpeaker** recipe for the Voxceleb 1&2 dataset. VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from interview videos uploaded to YouTube. See https://www.robots.ox.ac.uk/~vgg/data/voxceleb/ for more detailed information.
 
 The following recipes are provided:
+* v1: **Fully-Supervised** train on Voxceleb 1 development set and evaluate on Voxceleb1-O trials.
+
 * v2: **Fully-Supervised** train on Voxceleb 2 development set and evaluate on three official trials.
 
 * v2_deprecated: Deprecated version of fully-supervised train on Voxceleb dataset (deprecated IO).

diff --git a/examples/voxceleb/v1/Whisper-PMFA/README.md b/examples/voxceleb/v1/Whisper-PMFA/README.md
@@ -0,0 +1,24 @@
+## Results
+
+* Setup: mel80, num_frms500, epoch8, ArcMargin, aug_prob0.6, speed_perturb (no spec_aug)
+
+* Scoring: cosine (sub mean of vox1_dev), AS-Norm
+
+* Metric: EER(%)
+
+* 🔥 UPDATE 2024.08: We now support Whisper-PMFA, a Whisper-based speaker verification framework. Related paper:
+
+    * [Whisper-PMFA: Partial Multi-Scale Feature Aggregation for Speaker Verification using Whisper Models](https://arxiv.org/pdf/2408.15585)
+
+
+
+| Model                                | AS-Norm | Params | vox1-O-clean |
+| :----------------------------------- | ------- | ------ | :----------: |
+| ECAPA_TDNN_GLOB_c512-ASTP-emb192     | ×       | 6.19M  |     2.23     |
+|                                      | √       | 6.19M  |     2.00     |
+| ResNet34-TSTP-emb256                 | ×       | 6.63M  |     1.99     |
+|                                      | √       | 6.63M  |     1.88     |
+| Whisper-PMFA                         | ×       | 478.7M |     1.62     |
+|                                      | √       | 478.7M |   **1.42**   |
+| Whisper-PMFA with LoRA (Coming soon) | √       | 10.9M  |     1.62     |
+
diff --git a/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage0.yaml b/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage0.yaml
@@ -0,0 +1,78 @@
+### train configuration
+
+exp_dir: exp/test
+gpus: "[0,1]"
+num_avg: 10
+enable_amp: False # whether enable automatic mixed precision training
+
+seed: 42
+num_epochs: 4
+save_epoch_interval: 1 # save model every epoch
+log_batch_interval: 100 # log every 100 batches
+
+dataloader_args:
+  batch_size: 70
+  num_workers: 12
+  pin_memory: False
+  prefetch_factor: 8
+  drop_last: True
+
+dataset_args:
+  shuffle: True
+  shuffle_args:
+    shuffle_size: 2500
+  resample_rate: 16000
+  speed_perturb: True
+  num_frms: 500
+  aug_prob: 0.6 # prob to add reverb & noise aug per sample
+  frontend: whisper_encoder
+  whisper_encoder_args:
+    frozen: True
+    n_mels: 80
+    num_blocks: 24
+    output_size: 1280
+    n_head: 20
+    layer_st: 16
+    layer_ed: 23
+    model_path: whisper_hub/large-v2.pt
+  spec_aug: False
+  spec_aug_args:
+    num_t_mask: 1
+    num_f_mask: 1
+    max_t: 10
+    max_f: 8
+    prob: 0.6
+
+model: Whisper_PMFA_large_v2
+model_init: null
+model_args:
+  embed_dim: 192
+projection_args:
+  project_type: "arc_margin" # add_margin, arc_margin, sphere, softmax
+  scale: 32.0
+  easy_margin: False
+
+margin_scheduler: MarginScheduler
+margin_update:
+  initial_margin: 0.2
+  final_margin: 0.2
+  increase_start_epoch: 0
+  fix_start_epoch: 30
+  update_margin: True
+  increase_type: "exp" # exp, linear
+
+loss: CrossEntropyLoss
+loss_args: {}
+
+optimizer: SGD
+optimizer_args:
+  momentum: 0.9
+  nesterov: True
+  weight_decay: 0.0001
+
+scheduler: ExponentialDecrease
+scheduler_args:
+  initial_lr: 0.0025
+  final_lr: 0.00113
+  warm_up_epoch: 0
+  warm_from_zero: False
diff --git a/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage1.yaml b/examples/voxceleb/v1/Whisper-PMFA/conf/whisper_PMFA_stage1.yaml
@@ -0,0 +1,77 @@
+### train configuration
+
+exp_dir: exp/test
+gpus: "[0,1]"
+num_avg: 10
+enable_amp: False # whether enable automatic mixed precision training
+
+seed: 42
+num_epochs: 8
+save_epoch_interval: 1 # save model every epoch
+log_batch_interval: 100 # log every 100 batches
+
+dataloader_args:
+  batch_size: 15
+  num_workers: 12
+  pin_memory: False
+  prefetch_factor: 8
+  drop_last: True
+
+dataset_args:
+  shuffle: True
+  shuffle_args:
+    shuffle_size: 2500
+  resample_rate: 16000
+  speed_perturb: True
+  num_frms: 500
+  aug_prob: 0.6 # prob to add reverb & noise aug per sample
+  frontend: whisper_encoder
+  whisper_encoder_args:
+    frozen: False
+    n_mels: 80
+    num_blocks: 24
+    output_size: 1280
+    n_head: 20
+    layer_st: 16
+    layer_ed: 23
+  spec_aug: False
+  spec_aug_args:
+    num_t_mask: 1
+    num_f_mask: 1
+    max_t: 10
+    max_f: 8
+    prob: 0.6
+
+model: Whisper_PMFA_large_v2
+model_init: null
+model_args:
+  embed_dim: 192
+projection_args:
+  project_type: "arc_margin" # add_margin, arc_margin, sphere, softmax
+  scale: 32.0
+  easy_margin: False
+
+margin_scheduler: MarginScheduler
+margin_update:
+  initial_margin: 0.2
+  final_margin: 0.2
+  increase_start_epoch: 0
+  fix_start_epoch: 30
+  update_margin: True
+  increase_type: "exp" # exp, linear
+
+loss: CrossEntropyLoss
+loss_args: {}
+
+optimizer: SGD
+optimizer_args:
+  momentum: 0.9
+  nesterov: True
+  weight_decay: 0.0001
+
+scheduler: ExponentialDecrease
+scheduler_args:
+  initial_lr: 0.0025
+  final_lr: 0.00073
+  warm_up_epoch: 0
+  warm_from_zero: False
diff --git a/examples/voxceleb/v1/Whisper-PMFA/local/download_data.sh b/examples/voxceleb/v1/Whisper-PMFA/local/download_data.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Hongji Wang (jijijiang77@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+download_dir=data/download_data
+
+. tools/parse_options.sh || exit 1
+
+[ ! -d ${download_dir} ] && mkdir -p ${download_dir}
+
+if [ ! -f ${download_dir}/musan.tar.gz ]; then
+  echo "Downloading musan.tar.gz ..."
+  wget --no-check-certificate https://openslr.elda.org/resources/17/musan.tar.gz -P ${download_dir}
+  md5=$(md5sum ${download_dir}/musan.tar.gz | awk '{print $1}')
+  [ $md5 != "0c472d4fc0c5141eca47ad1ffeb2a7df" ] && echo "Wrong md5sum of musan.tar.gz" && exit 1
+fi
+
+if [ ! -f ${download_dir}/rirs_noises.zip ]; then
+  echo "Downloading rirs_noises.zip ..."
+  wget --no-check-certificate https://us.openslr.org/resources/28/rirs_noises.zip -P ${download_dir}
+  md5=$(md5sum ${download_dir}/rirs_noises.zip | awk '{print $1}')
+  [ $md5 != "e6f48e257286e05de56413b4779d8ffb" ] && echo "Wrong md5sum of rirs_noises.zip" && exit 1
+fi
+
+if [ ! -f ${download_dir}/vox1_test_wav.zip ]; then
+  echo "Downloading vox1_test_wav.zip ..."
+  wget --no-check-certificate https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_test_wav.zip -P ${download_dir}
+  md5=$(md5sum ${download_dir}/vox1_test_wav.zip | awk '{print $1}')
+  [ $md5 != "185fdc63c3c739954633d50379a3d102" ] && echo "Wrong md5sum of vox1_test_wav.zip" && exit 1
+fi
+
+if [ ! -f ${download_dir}/vox1_dev_wav.zip ]; then
+  echo "Downloading vox1_dev_wav.zip ..."
+  for part in a b c d; do
+    wget --no-check-certificate https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/vox1_dev_wav_parta${part} -P ${download_dir} &
+  done
+  wait
+  cat ${download_dir}/vox1_dev* >${download_dir}/vox1_dev_wav.zip
+  md5=$(md5sum ${download_dir}/vox1_dev_wav.zip | awk '{print $1}')
+  [ $md5 != "ae63e55b951748cc486645f532ba230b" ] && echo "Wrong md5sum of vox1_dev_wav.zip" && exit 1
+fi
+
+
+echo "Download success !!!"
diff --git a/examples/voxceleb/v1/Whisper-PMFA/local/download_whisper.sh b/examples/voxceleb/v1/Whisper-PMFA/local/download_whisper.sh
@@ -0,0 +1,13 @@
+download_dir=data/whisper_pretrained_model
+
+. tools/parse_options.sh || exit 1
+
+[ ! -d ${download_dir} ] && mkdir -p ${download_dir}
+
+if [ ! -f ${download_dir}/large-v2.pt ]; then
+  echo "Downloading large-v2.pt ..."
+  wget --no-check-certificate https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt -P ${download_dir}
+  md5=$(md5sum ${download_dir}/large-v2.pt | awk '{print $1}')
+  [ $md5 != "668764447eeda98eeba5ef7bfcb4cc3d" ] && echo "Wrong md5sum of musan.tar.gz" && exit 1
+fi
+
diff --git a/examples/voxceleb/v1/Whisper-PMFA/local/extract_vox.sh b/examples/voxceleb/v1/Whisper-PMFA/local/extract_vox.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Copyright (c) 2022 Hongji Wang (jijijiang77@gmail.com)
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+exp_dir=''
+model_path=''
+nj=4
+gpus="[0,1]"
+data_type="shard"  # shard/raw/feat
+data=data
+
+. tools/parse_options.sh
+set -e
+
+data_name_array=("vox1_dev" "vox1_test")
+data_list_path_array=("${data}/vox1_dev/${data_type}.list" "${data}/vox1_test/${data_type}.list")
+data_scp_path_array=("${data}/vox1_dev/wav.scp" "${data}/vox1_test/wav.scp") # to count the number of wavs
+nj_array=($nj $nj)
+batch_size_array=(16 1) # batch_size of test set must be 1 !!!
+num_workers_array=(4 1)
+count=${#data_name_array[@]}
+
+for i in $(seq 0 $(($count - 1))); do
+  wavs_num=$(wc -l ${data_scp_path_array[$i]} | awk '{print $1}')
+  bash tools/extract_embedding.sh --exp_dir ${exp_dir} \
+    --model_path $model_path \
+    --data_type ${data_type} \
+    --data_list ${data_list_path_array[$i]} \
+    --wavs_num ${wavs_num} \
+    --store_dir ${data_name_array[$i]} \
+    --batch_size ${batch_size_array[$i]} \
+    --num_workers ${num_workers_array[$i]} \
+    --nj ${nj_array[$i]} \
+    --gpus $gpus &
+done
+
+wait
+
+echo "Embedding dir is (${exp_dir}/embeddings)."