Skip to content

Commit

Permalink
Support On-the-fly Features Extraction (#145)
Browse files Browse the repository at this point in the history
Support on-the-fly feature extraction for large-scale data preprocessing
  • Loading branch information
RMSnow authored Feb 29, 2024
1 parent a58c386 commit b2102dc
Show file tree
Hide file tree
Showing 34 changed files with 1,146 additions and 395 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ egs/svc/*wavmark
egs/svc/custom
egs/svc/*/dev*
egs/svc/dev_exp_config.json
egs/svc/dev
bins/svc/demo*
bins/svc/preprocess_custom.py
data
Expand All @@ -47,6 +48,7 @@ ckpts
*.wav
*.flac
pretrained/wenet/*conformer_exp
pretrained/bigvgan/args.json
!egs/tts/VALLE/prompt_examples/*.wav

# Runtime data dirs
Expand Down
47 changes: 6 additions & 41 deletions config/base.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
"mel_min_max_norm": false,
// linguistic features
"extract_phone": false,
"lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
"lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
// content features
"extract_whisper_feature": false,
"extract_contentvec_feature": false,
Expand Down Expand Up @@ -94,7 +94,7 @@
"utt2emo": "utt2emo", // used for multi-emotion dataset
// Features used for model training
"use_text": false,
"use_phone": false,
"use_phone": false,
"use_phn_seq": false,
"use_lab": false,
"use_linear": false,
Expand All @@ -118,12 +118,10 @@
"use_label": false,
"use_one_hot": false,
"use_amplitude_phase": false,
"data_augment": false,
"align_mel_duration": false
},
"train": {
"ddp": false,
"random_seed": 970227,
"ddp": true,
"batch_size": 16,
"max_steps": 1000000,
// Trackers
Expand Down Expand Up @@ -182,39 +180,6 @@
"save_checkpoints_steps": 10000,
"valid_interval": 10000,
"keep_checkpoint_max": 5,
"multi_speaker_training": false, // True: train multi-speaker model; False: training single-speaker model;
"max_epoch": -1,
// -1 means no limit
"save_checkpoint_stride": [
5,
20
],
// unit is epoch
"keep_last": [
3,
-1
],
// -1 means infinite, if one number will broadcast
"run_eval": [
false,
true
],
// Batchsampler
"sampler": {
"holistic_shuffle": true,
"drop_last": true
},
// Dataloader
"dataloader": {
"num_worker": 32,
"pin_memory": true
},
// Trackers
"tracker": [
"tensorboard"
// "wandb",
// "cometml",
// "mlflow",
],
},
}
"multi_speaker_training": false // True: train multi-speaker model; False: training single-speaker model;
}
}
2 changes: 1 addition & 1 deletion config/comosvc.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"base_config": "config/base.json",
"base_config": "config/svc/base.json",
"model_type": "DiffComoSVC",
"task_type": "svc",
"preprocess": {
Expand Down
89 changes: 89 additions & 0 deletions config/svc/base.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
{
"base_config": "config/base.json",
"task_type": "svc",
"preprocess": {
// data augmentations
"use_pitch_shift": false,
"use_formant_shift": false,
"use_time_stretch": false,
"use_equalizer": false,
// Online or offline features extraction ("offline" or "online")
"features_extraction_mode": "offline",
// acoustic features
"extract_mel": true,
"mel_min_max_norm": true,
"extract_pitch": true,
"pitch_extractor": "parselmouth",
"extract_uv": true,
"extract_energy": true,
// content features
"extract_whisper_feature": false,
"whisper_sample_rate": 16000,
"extract_contentvec_feature": false,
"contentvec_sample_rate": 16000,
"extract_wenet_feature": false,
"wenet_sample_rate": 16000,
"extract_mert_feature": false,
"mert_sample_rate": 16000,
// Default config for whisper
"whisper_frameshift": 0.01,
"whisper_downsample_rate": 2,
// Default config for content vector
"contentvec_frameshift": 0.02,
// Default config for mert
"mert_model": "m-a-p/MERT-v1-330M",
"mert_feature_layer": -1,
"mert_hop_size": 320,
// 24k
"mert_frameshit": 0.01333,
// 10ms
"wenet_frameshift": 0.01,
// wenetspeech is 4, gigaspeech is 6
"wenet_downsample_rate": 4,
// Default config
"n_mel": 100,
"win_size": 1024,
// todo
"hop_size": 256,
"sample_rate": 24000,
"n_fft": 1024,
// todo
"fmin": 0,
"fmax": 12000,
// todo
"f0_min": 50,
// ~C2
"f0_max": 1100,
//1100, // ~C6(1100), ~G5(800)
"pitch_bin": 256,
"pitch_max": 1100.0,
"pitch_min": 50.0,
"is_label": true,
"is_mu_law": true,
"bits": 8,
"mel_min_max_stats_dir": "mel_min_max_stats",
"whisper_dir": "whisper",
"contentvec_dir": "contentvec",
"wenet_dir": "wenet",
"mert_dir": "mert",
// Extract content features using dataloader
"pin_memory": true,
"num_workers": 8,
"content_feature_batch_size": 16,
// Features used for model training
"use_mel": true,
"use_min_max_norm_mel": true,
"use_frame_pitch": true,
"use_uv": true,
"use_interpolation_for_uv": false,
"use_frame_energy": true,
"use_log_scale_pitch": false,
"use_log_scale_energy": false,
"use_spkid": true,
// Meta file
"train_file": "train.json",
"valid_file": "test.json",
"spk2id": "singers.json",
"utt2spk": "utt2singer"
},
}
100 changes: 8 additions & 92 deletions config/diffusion.json → config/svc/diffusion.json
Original file line number Diff line number Diff line change
@@ -1,102 +1,20 @@
{
// FIXME: THESE ARE LEGACY
"base_config": "config/base.json",
"model_type": "diffusion",
"task_type": "svc",
"preprocess": {
// data augmentations
"use_pitch_shift": false,
"use_formant_shift": false,
"use_time_stretch": false,
"use_equalizer": false,
// acoustic features
"extract_mel": true,
"mel_min_max_norm": true,
"extract_pitch": true,
"pitch_extractor": "parselmouth",
"extract_uv": true,
"extract_energy": true,
// content features
"extract_whisper_feature": false,
"whisper_sample_rate": 16000,
"extract_contentvec_feature": false,
"contentvec_sample_rate": 16000,
"extract_wenet_feature": false,
"wenet_sample_rate": 16000,
"extract_mert_feature": false,
"mert_sample_rate": 16000,
// Default config for whisper
"whisper_frameshift": 0.01,
"whisper_downsample_rate": 2,
// Default config for content vector
"contentvec_frameshift": 0.02,
// Default config for mert
"mert_model": "m-a-p/MERT-v1-330M",
"mert_feature_layer": -1,
"mert_hop_size": 320,
// 24k
"mert_frameshit": 0.01333,
// 10ms
"wenet_frameshift": 0.01,
// wenetspeech is 4, gigaspeech is 6
"wenet_downsample_rate": 4,
// Default config
"n_mel": 100,
"win_size": 1024,
// todo
"hop_size": 256,
"sample_rate": 24000,
"n_fft": 1024,
// todo
"fmin": 0,
"fmax": 12000,
// todo
"f0_min": 50,
// ~C2
"f0_max": 1100,
//1100, // ~C6(1100), ~G5(800)
"pitch_bin": 256,
"pitch_max": 1100.0,
"pitch_min": 50.0,
"is_label": true,
"is_mu_law": true,
"bits": 8,
"mel_min_max_stats_dir": "mel_min_max_stats",
"whisper_dir": "whisper",
"contentvec_dir": "contentvec",
"wenet_dir": "wenet",
"mert_dir": "mert",
// Extract content features using dataloader
"pin_memory": true,
"num_workers": 8,
"content_feature_batch_size": 16,
// Features used for model training
"use_mel": true,
"use_min_max_norm_mel": true,
"use_frame_pitch": true,
"use_uv": true,
"use_frame_energy": true,
"use_log_scale_pitch": false,
"use_log_scale_energy": false,
"use_spkid": true,
// Meta file
"train_file": "train.json",
"valid_file": "test.json",
"spk2id": "singers.json",
"utt2spk": "utt2singer"
},
"base_config": "config/svc/base.json",
"model": {
"condition_encoder": {
"merge_mode": "add",
// Prosody Features
"use_f0": true,
"use_uv": true,
"use_energy": true,
// Quantization (0 for not quantization)
"input_melody_dim": 1,
"use_log_f0": true,
"n_bins_melody": 256,
//# Quantization (0 for not quantization)
"output_melody_dim": 384,
"input_loudness_dim": 1,
"use_log_loudness": true,
"n_bins_loudness": 256,
"output_loudness_dim": 384,
// Semantic Features
"use_whisper": false,
"use_contentvec": false,
"use_wenet": false,
Expand All @@ -106,12 +24,11 @@
"mert_dim": 256,
"wenet_dim": 512,
"content_encoder_dim": 384,
// Speaker Features
"output_singer_dim": 384,
"singer_table_size": 512,
"output_content_dim": 384,
"use_spkid": true
},
// FIXME: FOLLOWING ARE NEW!!
"diffusion": {
"scheduler": "ddpm",
"scheduler_settings": {
Expand Down Expand Up @@ -159,7 +76,6 @@
}
}
},
// FIXME: FOLLOWING ARE NEW!!
"train": {
// Basic settings
"batch_size": 64,
Expand Down
2 changes: 1 addition & 1 deletion config/transformer.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"base_config": "config/base.json",
"base_config": "config/svc/base.json",
"model_type": "Transformer",
"task_type": "svc",
"preprocess": {
Expand Down
Loading

0 comments on commit b2102dc

Please sign in to comment.