diff --git a/.gitignore b/.gitignore index ab181f9a..67f8d188 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ egs/svc/*wavmark egs/svc/custom egs/svc/*/dev* egs/svc/dev_exp_config.json +egs/svc/dev bins/svc/demo* bins/svc/preprocess_custom.py data @@ -47,6 +48,7 @@ ckpts *.wav *.flac pretrained/wenet/*conformer_exp +pretrained/bigvgan/args.json !egs/tts/VALLE/prompt_examples/*.wav # Runtime data dirs diff --git a/config/base.json b/config/base.json index ca0af9bf..144d0759 100644 --- a/config/base.json +++ b/config/base.json @@ -41,7 +41,7 @@ "mel_min_max_norm": false, // lingusitic features "extract_phone": false, - "lexicon_path": "./text/lexicon/librispeech-lexicon.txt", + "lexicon_path": "./text/lexicon/librispeech-lexicon.txt", // content features "extract_whisper_feature": false, "extract_contentvec_feature": false, @@ -94,7 +94,7 @@ "utt2emo": "utt2emo", // used for multi-emotion dataset // Features used for model training "use_text": false, - "use_phone": false, + "use_phone": false, "use_phn_seq": false, "use_lab": false, "use_linear": false, @@ -118,12 +118,10 @@ "use_label": false, "use_one_hot": false, "use_amplitude_phase": false, - "data_augment": false, "align_mel_duration": false }, "train": { - "ddp": false, - "random_seed": 970227, + "ddp": true, "batch_size": 16, "max_steps": 1000000, // Trackers @@ -182,39 +180,6 @@ "save_checkpoints_steps": 10000, "valid_interval": 10000, "keep_checkpoint_max": 5, - "multi_speaker_training": false, // True: train multi-speaker model; False: training single-speaker model; - "max_epoch": -1, - // -1 means no limit - "save_checkpoint_stride": [ - 5, - 20 - ], - // unit is epoch - "keep_last": [ - 3, - -1 - ], - // -1 means infinite, if one number will broadcast - "run_eval": [ - false, - true - ], - // Batchsampler - "sampler": { - "holistic_shuffle": true, - "drop_last": true - }, - // Dataloader - "dataloader": { - "num_worker": 32, - "pin_memory": true - }, - // Trackers - "tracker": [ - "tensorboard" - // "wandb", - // "cometml", - // "mlflow", - ], - }, -} + "multi_speaker_training": false // True: train multi-speaker model; False: training single-speaker model; + } +} \ No newline at end of file diff --git a/config/comosvc.json b/config/comosvc.json index a8bec1e3..c1af0e58 100644 --- a/config/comosvc.json +++ b/config/comosvc.json @@ -1,5 +1,5 @@ { - "base_config": "config/base.json", + "base_config": "config/svc/base.json", "model_type": "DiffComoSVC", "task_type": "svc", "preprocess": { diff --git a/config/svc/base.json b/config/svc/base.json new file mode 100644 index 00000000..591a5e6a --- /dev/null +++ b/config/svc/base.json @@ -0,0 +1,89 @@ +{ + "base_config": "config/base.json", + "task_type": "svc", + "preprocess": { + // data augmentations + "use_pitch_shift": false, + "use_formant_shift": false, + "use_time_stretch": false, + "use_equalizer": false, + // Online or offline features extraction ("offline" or "online") + "features_extraction_mode": "offline", + // acoustic features + "extract_mel": true, + "mel_min_max_norm": true, + "extract_pitch": true, + "pitch_extractor": "parselmouth", + "extract_uv": true, + "extract_energy": true, + // content features + "extract_whisper_feature": false, + "whisper_sample_rate": 16000, + "extract_contentvec_feature": false, + "contentvec_sample_rate": 16000, + "extract_wenet_feature": false, + "wenet_sample_rate": 16000, + "extract_mert_feature": false, + "mert_sample_rate": 16000, + // Default config for whisper + "whisper_frameshift": 0.01, + "whisper_downsample_rate": 2, 
+ // Default config for content vector + "contentvec_frameshift": 0.02, + // Default config for mert + "mert_model": "m-a-p/MERT-v1-330M", + "mert_feature_layer": -1, + "mert_hop_size": 320, + // 24k + "mert_frameshit": 0.01333, + // 10ms + "wenet_frameshift": 0.01, + // wenetspeech is 4, gigaspeech is 6 + "wenet_downsample_rate": 4, + // Default config + "n_mel": 100, + "win_size": 1024, + // todo + "hop_size": 256, + "sample_rate": 24000, + "n_fft": 1024, + // todo + "fmin": 0, + "fmax": 12000, + // todo + "f0_min": 50, + // ~C2 + "f0_max": 1100, + //1100, // ~C6(1100), ~G5(800) + "pitch_bin": 256, + "pitch_max": 1100.0, + "pitch_min": 50.0, + "is_label": true, + "is_mu_law": true, + "bits": 8, + "mel_min_max_stats_dir": "mel_min_max_stats", + "whisper_dir": "whisper", + "contentvec_dir": "contentvec", + "wenet_dir": "wenet", + "mert_dir": "mert", + // Extract content features using dataloader + "pin_memory": true, + "num_workers": 8, + "content_feature_batch_size": 16, + // Features used for model training + "use_mel": true, + "use_min_max_norm_mel": true, + "use_frame_pitch": true, + "use_uv": true, + "use_interpolation_for_uv": false, + "use_frame_energy": true, + "use_log_scale_pitch": false, + "use_log_scale_energy": false, + "use_spkid": true, + // Meta file + "train_file": "train.json", + "valid_file": "test.json", + "spk2id": "singers.json", + "utt2spk": "utt2singer" + }, +} \ No newline at end of file diff --git a/config/diffusion.json b/config/svc/diffusion.json similarity index 56% rename from config/diffusion.json rename to config/svc/diffusion.json index 904c575c..d9e538b7 100644 --- a/config/diffusion.json +++ b/config/svc/diffusion.json @@ -1,102 +1,20 @@ { - // FIXME: THESE ARE LEGACY - "base_config": "config/base.json", - "model_type": "diffusion", - "task_type": "svc", - "preprocess": { - // data augmentations - "use_pitch_shift": false, - "use_formant_shift": false, - "use_time_stretch": false, - "use_equalizer": false, - // acoustic features - "extract_mel": true, - "mel_min_max_norm": true, - "extract_pitch": true, - "pitch_extractor": "parselmouth", - "extract_uv": true, - "extract_energy": true, - // content features - "extract_whisper_feature": false, - "whisper_sample_rate": 16000, - "extract_contentvec_feature": false, - "contentvec_sample_rate": 16000, - "extract_wenet_feature": false, - "wenet_sample_rate": 16000, - "extract_mert_feature": false, - "mert_sample_rate": 16000, - // Default config for whisper - "whisper_frameshift": 0.01, - "whisper_downsample_rate": 2, - // Default config for content vector - "contentvec_frameshift": 0.02, - // Default config for mert - "mert_model": "m-a-p/MERT-v1-330M", - "mert_feature_layer": -1, - "mert_hop_size": 320, - // 24k - "mert_frameshit": 0.01333, - // 10ms - "wenet_frameshift": 0.01, - // wenetspeech is 4, gigaspeech is 6 - "wenet_downsample_rate": 4, - // Default config - "n_mel": 100, - "win_size": 1024, - // todo - "hop_size": 256, - "sample_rate": 24000, - "n_fft": 1024, - // todo - "fmin": 0, - "fmax": 12000, - // todo - "f0_min": 50, - // ~C2 - "f0_max": 1100, - //1100, // ~C6(1100), ~G5(800) - "pitch_bin": 256, - "pitch_max": 1100.0, - "pitch_min": 50.0, - "is_label": true, - "is_mu_law": true, - "bits": 8, - "mel_min_max_stats_dir": "mel_min_max_stats", - "whisper_dir": "whisper", - "contentvec_dir": "contentvec", - "wenet_dir": "wenet", - "mert_dir": "mert", - // Extract content features using dataloader - "pin_memory": true, - "num_workers": 8, - "content_feature_batch_size": 16, - // Features used for 
model training - "use_mel": true, - "use_min_max_norm_mel": true, - "use_frame_pitch": true, - "use_uv": true, - "use_frame_energy": true, - "use_log_scale_pitch": false, - "use_log_scale_energy": false, - "use_spkid": true, - // Meta file - "train_file": "train.json", - "valid_file": "test.json", - "spk2id": "singers.json", - "utt2spk": "utt2singer" - }, + "base_config": "config/svc/base.json", "model": { "condition_encoder": { "merge_mode": "add", + // Prosody Features + "use_f0": true, + "use_uv": true, + "use_energy": true, + // Quantization (0 for not quantization) "input_melody_dim": 1, - "use_log_f0": true, "n_bins_melody": 256, - //# Quantization (0 for not quantization) "output_melody_dim": 384, "input_loudness_dim": 1, - "use_log_loudness": true, "n_bins_loudness": 256, "output_loudness_dim": 384, + // Semantic Features "use_whisper": false, "use_contentvec": false, "use_wenet": false, @@ -106,12 +24,11 @@ "mert_dim": 256, "wenet_dim": 512, "content_encoder_dim": 384, + // Speaker Features "output_singer_dim": 384, "singer_table_size": 512, - "output_content_dim": 384, "use_spkid": true }, - // FIXME: FOLLOWING ARE NEW!! "diffusion": { "scheduler": "ddpm", "scheduler_settings": { @@ -159,7 +76,6 @@ } } }, - // FIXME: FOLLOWING ARE NEW!! "train": { // Basic settings "batch_size": 64, diff --git a/config/transformer.json b/config/transformer.json index f5b93d3d..be3514e9 100644 --- a/config/transformer.json +++ b/config/transformer.json @@ -1,5 +1,5 @@ { - "base_config": "config/base.json", + "base_config": "config/svc/base.json", "model_type": "Transformer", "task_type": "svc", "preprocess": { diff --git a/config/vitssvc.json b/config/vitssvc.json index a2572b66..53aa1642 100644 --- a/config/vitssvc.json +++ b/config/vitssvc.json @@ -1,5 +1,5 @@ { - "base_config": "config/base.json", + "base_config": "config/svc/base.json", "model_type": "VITS", "task_type": "svc", "preprocess": { @@ -11,7 +11,6 @@ "extract_uv": true, "extract_linear_spec": true, "extract_audio": true, - "mel_min_max_norm": true, // Config for features usage "use_linear": true, @@ -26,7 +25,6 @@ "use_wenet": false, "use_text": false, "use_phone": false, - "fmin": 0, "fmax": 12000, "f0_min": 50, @@ -42,13 +40,11 @@ "segment_size": 8192, "n_mel": 100, "sample_rate": 24000, - "mel_min_max_stats_dir": "mel_min_max_stats", "whisper_dir": "whisper", "contentvec_dir": "contentvec", "wenet_dir": "wenet", "mert_dir": "mert", - // Meta file "train_file": "train.json", "valid_file": "test.json", @@ -79,7 +75,6 @@ "output_singer_dim": 384, "output_content_dim": 384, "use_spkid": true, - "pitch_max": 1100.0, "pitch_min": 50.0, }, @@ -107,20 +102,43 @@ 11 ], "upsample_rates": [ - 8,8,2,2 + 8, + 8, + 2, + 2 ], "upsample_kernel_sizes": [ - 16,16,4,4 + 16, + 16, + 4, + 4 ], "upsample_initial_channel": 512, "resblock_dilation_sizes": [ - [1,3,5], - [1,3,5], - [1,3,5] + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] ] }, "melgan": { - "ratios": [8, 8, 2, 2], + "ratios": [ + 8, + 8, + 2, + 2 + ], "ngf": 32, "n_residual_layers": 3, "num_D": 3, @@ -133,10 +151,16 @@ "activation": "snakebeta", "snake_logscale": true, "upsample_rates": [ - 8,8,2,2 + 8, + 8, + 2, + 2 ], "upsample_kernel_sizes": [ - 16,16,4,4 + 16, + 16, + 4, + 4 ], "upsample_initial_channel": 512, "resblock_kernel_sizes": [ @@ -145,19 +169,37 @@ 11 ], "resblock_dilation_sizes": [ - [1,3,5], - [1,3,5], - [1,3,5] + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] ] }, "nsfhifigan": { "resblock": "1", "harmonic_num": 8, "upsample_rates": [ 
- 8,8,2,2 + 8, + 8, + 2, + 2 ], "upsample_kernel_sizes": [ - 16,16,4,4 + 16, + 16, + 4, + 4 ], "upsample_initial_channel": 768, "resblock_kernel_sizes": [ @@ -166,24 +208,75 @@ 11 ], "resblock_dilation_sizes": [ - [1,3,5], - [1,3,5], - [1,3,5] + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] ] }, "apnet": { - "ASP_channel": 512, - "ASP_resblock_kernel_sizes": [3,7,11], - "ASP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], - "ASP_input_conv_kernel_size": 7, - "ASP_output_conv_kernel_size": 7, - - "PSP_channel": 512, - "PSP_resblock_kernel_sizes": [3,7,11], - "PSP_resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], - "PSP_input_conv_kernel_size": 7, - "PSP_output_R_conv_kernel_size": 7, - "PSP_output_I_conv_kernel_size": 7, + "ASP_channel": 512, + "ASP_resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "ASP_resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "ASP_input_conv_kernel_size": 7, + "ASP_output_conv_kernel_size": 7, + "PSP_channel": 512, + "PSP_resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "PSP_resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "PSP_input_conv_kernel_size": 7, + "PSP_output_R_conv_kernel_size": 7, + "PSP_output_I_conv_kernel_size": 7, } }, }, diff --git a/egs/svc/MultipleContentsSVC/exp_config.json b/egs/svc/MultipleContentsSVC/exp_config.json index 7047855a..b607494c 100755 --- a/egs/svc/MultipleContentsSVC/exp_config.json +++ b/egs/svc/MultipleContentsSVC/exp_config.json @@ -1,5 +1,5 @@ { - "base_config": "config/diffusion.json", + "base_config": "config/svc/diffusion.json", "model_type": "DiffWaveNetSVC", "dataset": [ "m4singer", @@ -22,6 +22,7 @@ // TODO: Fill in the output data path. The default value is "Amphion/data" "processed_dir": "data", // Config for features extraction + "features_extraction_mode": "offline", // Online or offline features extraction ("offline" or "online") "extract_mel": true, "extract_pitch": true, "extract_energy": true, diff --git a/evaluation/features/singing_power_ratio.py b/evaluation/features/singing_power_ratio.py index 18ad86b6..60051016 100644 --- a/evaluation/features/singing_power_ratio.py +++ b/evaluation/features/singing_power_ratio.py @@ -77,7 +77,7 @@ def extract_spr( f0 = get_f0_features_using_parselmouth( audio, cfg, - )[0] + ) # Mel length alignment length = min(len(f0), mel1.shape[-1]) diff --git a/evaluation/metrics/f0/f0_pearson_coefficients.py b/evaluation/metrics/f0/f0_pearson_coefficients.py index 8aad2835..df1fb4f2 100644 --- a/evaluation/metrics/f0/f0_pearson_coefficients.py +++ b/evaluation/metrics/f0/f0_pearson_coefficients.py @@ -66,12 +66,12 @@ def extract_fpc( f0_ref = get_f0_features_using_parselmouth( audio_ref, cfg, - )[0] + ) f0_deg = get_f0_features_using_parselmouth( audio_deg, cfg, - )[0] + ) # Subtract mean value from f0 if need_mean: diff --git a/evaluation/metrics/f0/f0_rmse.py b/evaluation/metrics/f0/f0_rmse.py index af072ef2..d664145e 100644 --- a/evaluation/metrics/f0/f0_rmse.py +++ b/evaluation/metrics/f0/f0_rmse.py @@ -65,12 +65,12 @@ def extract_f0rmse( f0_ref = get_f0_features_using_parselmouth( audio_ref, cfg, - )[0] + ) f0_deg = get_f0_features_using_parselmouth( audio_deg, cfg, - )[0] + ) # Subtract mean value from f0 if need_mean: diff --git a/evaluation/metrics/f0/v_uv_f1.py b/evaluation/metrics/f0/v_uv_f1.py index 24e73156..e9b79d55 100644 --- a/evaluation/metrics/f0/v_uv_f1.py +++ b/evaluation/metrics/f0/v_uv_f1.py @@ -64,12 +64,12 @@ def 
extract_f1_v_uv( f0_ref = get_f0_features_using_parselmouth( audio_ref, cfg, - )[0] + ) f0_deg = get_f0_features_using_parselmouth( audio_deg, cfg, - )[0] + ) # Avoid silence min_length = min(len(f0_ref), len(f0_deg)) diff --git a/models/base/base_dataset.py b/models/base/base_dataset.py index 3486edfd..8c1216a2 100644 --- a/models/base/base_dataset.py +++ b/models/base/base_dataset.py @@ -7,13 +7,15 @@ import numpy as np import torch.utils.data from torch.nn.utils.rnn import pad_sequence +import librosa + from utils.data_utils import * from processors.acoustic_extractor import cal_normalized_mel from text import text_to_sequence from text.text_token_collation import phoneIDCollation -class BaseDataset(torch.utils.data.Dataset): +class BaseOfflineDataset(torch.utils.data.Dataset): def __init__(self, cfg, dataset, is_valid=False): """ Args: @@ -269,7 +271,7 @@ def __len__(self): return len(self.metadata) -class BaseCollator(object): +class BaseOfflineCollator(object): """Zero-pads model inputs and targets based on number of frames per step""" def __init__(self, cfg): @@ -280,7 +282,7 @@ def __call__(self, batch): # mel: [b, T, n_mels] # frame_pitch, frame_energy: [1, T] - # target_len: [1] + # target_len: [b] # spk_id: [b, 1] # mask: [b, T, 1] @@ -320,6 +322,124 @@ def __call__(self, batch): return packed_batch_features +class BaseOnlineDataset(torch.utils.data.Dataset): + def __init__(self, cfg, dataset, is_valid=False): + """ + Args: + cfg: config + dataset: dataset name + is_valid: whether to use train or valid dataset + """ + assert isinstance(dataset, str) + + self.cfg = cfg + self.sample_rate = cfg.preprocess.sample_rate + self.hop_size = self.cfg.preprocess.hop_size + + processed_data_dir = os.path.join(cfg.preprocess.processed_dir, dataset) + meta_file = cfg.preprocess.valid_file if is_valid else cfg.preprocess.train_file + self.metafile_path = os.path.join(processed_data_dir, meta_file) + self.metadata = self.get_metadata() + + """ + load spk2id and utt2spk from json file + spk2id: {spk1: 0, spk2: 1, ...} + utt2spk: {dataset_uid: spk1, ...} + """ + if cfg.preprocess.use_spkid: + spk2id_path = os.path.join(processed_data_dir, cfg.preprocess.spk2id) + with open(spk2id_path, "r") as f: + self.spk2id = json.load(f) + + utt2spk_path = os.path.join(processed_data_dir, cfg.preprocess.utt2spk) + self.utt2spk = dict() + with open(utt2spk_path, "r") as f: + for line in f.readlines(): + utt, spk = line.strip().split("\t") + self.utt2spk[utt] = spk + + def get_metadata(self): + with open(self.metafile_path, "r", encoding="utf-8") as f: + metadata = json.load(f) + + return metadata + + def get_dataset_name(self): + return self.metadata[0]["Dataset"] + + def __getitem__(self, index): + """ + single_feature: + wav: (T) + wav_len: int + target_len: int + mask: (n_frames, 1) + spk_id: (1) + """ + utt_item = self.metadata[index] + + wav_path = utt_item["Path"] + wav, _ = librosa.load(wav_path, sr=self.sample_rate) + # wav: (T) + wav = torch.as_tensor(wav, dtype=torch.float32) + wav_len = len(wav) + # mask: (n_frames, 1) + frame_len = wav_len // self.hop_size + mask = torch.ones(frame_len, 1, dtype=torch.long) + + single_feature = { + "wav": wav, + "wav_len": wav_len, + "target_len": frame_len, + "mask": mask, + } + + if self.cfg.preprocess.use_spkid: + utt = "{}_{}".format(utt_item["Dataset"], utt_item["Uid"]) + single_feature["spk_id"] = torch.tensor( + [self.spk2id[self.utt2spk[utt]]], dtype=torch.int32 + ) + + return single_feature + + def __len__(self): + return len(self.metadata) + + +class 
BaseOnlineCollator(object):
+    """Zero-pads model inputs and targets based on number of frames per step (For on-the-fly features extraction, whose iterative item contains only wavs)"""
+
+    def __init__(self, cfg):
+        self.cfg = cfg
+
+    def __call__(self, batch):
+        """
+        BaseOnlineDataset.__getitem__:
+            wav: (T,)
+            wav_len: int
+            target_len: int
+            mask: (n_frames, 1)
+            spk_id: (1)
+
+        Returns:
+            wav: (B, T), torch.float32
+            wav_len: (B), torch.long
+            target_len: (B), torch.long
+            mask: (B, n_frames, 1), torch.long
+            spk_id: (B, 1), torch.int32
+        """
+        packed_batch_features = dict()
+
+        for key in batch[0].keys():
+            if key in ["wav_len", "target_len"]:
+                packed_batch_features[key] = torch.LongTensor([b[key] for b in batch])
+            else:
+                packed_batch_features[key] = pad_sequence(
+                    [b[key] for b in batch], batch_first=True, padding_value=0
+                )
+        return packed_batch_features
+
+
 class BaseTestDataset(torch.utils.data.Dataset):
     def __init__(self, cfg, args):
         raise NotImplementedError
diff --git a/models/base/base_sampler.py b/models/base/base_sampler.py
index 593c91bc..e5e882ac 100644
--- a/models/base/base_sampler.py
+++ b/models/base/base_sampler.py
@@ -26,7 +26,7 @@ class ScheduledSampler(Sampler):
     Usage:
     For cfg.train.batch_size = 3, cfg.train.holistic_shuffle = False, cfg.train.drop_last = True:
-    >>> list(ScheduledSampler(ConcatDataset([0, 1, 2], [3, 4, 5], [6, 7, 8]])))
+    >>> list(ScheduledSampler(ConcatDataset([[0, 1, 2], [3, 4, 5], [6, 7, 8]])))
     [3, 4, 5, 0, 1, 2, 6, 7, 8]
     """
diff --git a/models/base/new_inference.py b/models/base/new_inference.py
index 4813fca4..01dce86d 100644
--- a/models/base/new_inference.py
+++ b/models/base/new_inference.py
@@ -130,8 +130,12 @@ def _inference_each_batch(self, batch_data):
     def inference(self):
         for i, batch in enumerate(self.test_dataloader):
             y_pred = self._inference_each_batch(batch).cpu()
-            mel_min, mel_max = self.test_dataset.target_mel_extrema
-            y_pred = (y_pred + 1.0) / 2.0 * (mel_max - mel_min + EPS) + mel_min
+
+            # Check whether the min-max normalization is used
+            if self.cfg.preprocess.use_min_max_norm_mel:
+                mel_min, mel_max = self.test_dataset.target_mel_extrema
+                y_pred = (y_pred + 1.0) / 2.0 * (mel_max - mel_min + EPS) + mel_min
+
             y_ls = y_pred.chunk(self.test_batch_size)
             tgt_ls = batch["target_len"].cpu().chunk(self.test_batch_size)
             j = 0
diff --git a/models/base/new_trainer.py b/models/base/new_trainer.py
index 4200e765..9bec327d 100644
--- a/models/base/new_trainer.py
+++ b/models/base/new_trainer.py
@@ -463,7 +463,6 @@ def _load_model(
         return checkpoint_path
 
-    # TODO: LEGACY CODE
     def _build_dataloader(self):
         Dataset, Collator = self._build_dataset()
 
@@ -480,6 +479,7 @@ def _build_dataloader(self):
         # TODO: use config instead of (sampler, shuffle, drop_last, batch_size)
         train_loader = DataLoader(
             train_dataset,
+            # shuffle=True,
             collate_fn=train_collate,
             batch_sampler=batch_sampler,
             num_workers=self.cfg.train.dataloader.num_worker,
@@ -514,41 +514,39 @@ def _set_random_seed(seed):
     def _check_nan(self, loss, y_pred, y_gt):
         if torch.any(torch.isnan(loss)):
-            self.logger.fatal("Fatal Error: Training is down since loss has Nan!")
+            self.logger.error("Fatal Error: Training is down since loss has Nan!")
             self.logger.error("loss = {:.6f}".format(loss.item()), in_order=True)
+
+            ### y_pred ###
             if torch.any(torch.isnan(y_pred)):
                 self.logger.error(
                     f"y_pred has Nan: {torch.any(torch.isnan(y_pred))}", in_order=True
                 )
+                self.logger.error(f"y_pred: {y_pred}", in_order=True)
             else:
                 self.logger.debug(
                     f"y_pred has Nan: {torch.any(torch.isnan(y_pred))}",
in_order=True ) + self.logger.debug(f"y_pred: {y_pred}", in_order=True) + + ### y_gt ### if torch.any(torch.isnan(y_gt)): self.logger.error( f"y_gt has Nan: {torch.any(torch.isnan(y_gt))}", in_order=True ) + self.logger.error(f"y_gt: {y_gt}", in_order=True) else: self.logger.debug( f"y_gt has nan: {torch.any(torch.isnan(y_gt))}", in_order=True ) - if torch.any(torch.isnan(y_pred)): - self.logger.error(f"y_pred: {y_pred}", in_order=True) - else: - self.logger.debug(f"y_pred: {y_pred}", in_order=True) - if torch.any(torch.isnan(y_gt)): - self.logger.error(f"y_gt: {y_gt}", in_order=True) - else: self.logger.debug(f"y_gt: {y_gt}", in_order=True) - # TODO: still OK to save tracking? self.accelerator.end_training() raise RuntimeError("Loss has Nan! See log for more info.") ### Protected methods end ### ## Following are private methods ## - ## !!! These are inconvenient for GAN-based model training. It'd be better to move these to svc_trainer.py if needed. def _build_optimizer(self): r"""Build optimizer for model.""" # Make case-insensitive matching diff --git a/models/svc/base/svc_dataset.py b/models/svc/base/svc_dataset.py index c4f908f4..d8ef7d5a 100644 --- a/models/svc/base/svc_dataset.py +++ b/models/svc/base/svc_dataset.py @@ -9,6 +9,8 @@ import json import os import numpy as np +import librosa + from utils.data_utils import * from processors.acoustic_extractor import cal_normalized_mel, load_mel_extrema from processors.content_extractor import ( @@ -17,17 +19,19 @@ WenetExtractor, ) from models.base.base_dataset import ( - BaseCollator, - BaseDataset, + BaseOfflineDataset, + BaseOfflineCollator, + BaseOnlineDataset, + BaseOnlineCollator, ) from models.base.new_dataset import BaseTestDataset EPS = 1.0e-12 -class SVCDataset(BaseDataset): +class SVCOfflineDataset(BaseOfflineDataset): def __init__(self, cfg, dataset, is_valid=False): - BaseDataset.__init__(self, cfg, dataset, is_valid=is_valid) + BaseOfflineDataset.__init__(self, cfg, dataset, is_valid=is_valid) cfg = self.cfg @@ -56,7 +60,7 @@ def __init__(self, cfg, dataset, is_valid=False): ) def __getitem__(self, index): - single_feature = BaseDataset.__getitem__(self, index) + single_feature = BaseOfflineDataset.__getitem__(self, index) utt_info = self.metadata[index] dataset = utt_info["Dataset"] @@ -65,15 +69,19 @@ def __getitem__(self, index): if self.cfg.model.condition_encoder.use_whisper: assert "target_len" in single_feature.keys() - aligned_whisper_feat = self.whisper_aligner.offline_align( - np.load(self.utt2whisper_path[utt]), single_feature["target_len"] + aligned_whisper_feat = ( + self.whisper_aligner.offline_resolution_transformation( + np.load(self.utt2whisper_path[utt]), single_feature["target_len"] + ) ) single_feature["whisper_feat"] = aligned_whisper_feat if self.cfg.model.condition_encoder.use_contentvec: assert "target_len" in single_feature.keys() - aligned_contentvec = self.contentvec_aligner.offline_align( - np.load(self.utt2contentVec_path[utt]), single_feature["target_len"] + aligned_contentvec = ( + self.contentvec_aligner.offline_resolution_transformation( + np.load(self.utt2contentVec_path[utt]), single_feature["target_len"] + ) ) single_feature["contentvec_feat"] = aligned_contentvec @@ -88,7 +96,7 @@ def __getitem__(self, index): if self.cfg.model.condition_encoder.use_wenet: assert "target_len" in single_feature.keys() - aligned_wenet_feat = self.wenet_aligner.offline_align( + aligned_wenet_feat = self.wenet_aligner.offline_resolution_transformation( np.load(self.utt2wenet_path[utt]), 
single_feature["target_len"]
             )
             single_feature["wenet_feat"] = aligned_wenet_feat
@@ -153,17 +161,154 @@ def clip_if_too_long(self, sample, max_seq_len=512):
         return sample
 
 
-class SVCCollator(BaseCollator):
-    """Zero-pads model inputs and targets based on number of frames per step"""
+class SVCOnlineDataset(BaseOnlineDataset):
+    def __init__(self, cfg, dataset, is_valid=False):
+        super().__init__(cfg, dataset, is_valid=is_valid)
+
+        # Audio pretrained models' sample rates
+        self.all_sample_rates = {self.sample_rate}
+        if self.cfg.model.condition_encoder.use_whisper:
+            self.all_sample_rates.add(self.cfg.preprocess.whisper_sample_rate)
+        if self.cfg.model.condition_encoder.use_contentvec:
+            self.all_sample_rates.add(self.cfg.preprocess.contentvec_sample_rate)
+        if self.cfg.model.condition_encoder.use_wenet:
+            self.all_sample_rates.add(self.cfg.preprocess.wenet_sample_rate)
+
+        self.highest_sample_rate = max(list(self.all_sample_rates))
+
+        # The maximum duration (seconds) for one training sample
+        self.max_duration = 6.0
+        self.max_n_frames = int(self.max_duration * self.highest_sample_rate)
+
+    def random_select(self, wav, duration, wav_path):
+        """
+        wav: (T,)
+        """
+        if duration <= self.max_duration:
+            return wav
+
+        ts_frame = int((duration - self.max_duration) * self.highest_sample_rate)
+        start = random.randint(0, ts_frame)
+        end = start + self.max_n_frames
+
+        if (wav[start:end] == 0).all():
+            print("*" * 20)
+            print("Warning! The wav file {} has a lot of silence.".format(wav_path))
+
+            # There should be at least some frames that are not silent. Then we select them.
+            assert (wav != 0).any()
+            start = np.where(wav != 0)[0][0]
+            end = start + self.max_n_frames
+
+        return wav[start:end]
+
+    def __getitem__(self, index):
+        """
+        single_feature: dict,
+            wav: (T,)
+            wav_len: int
+            target_len: int
+            mask: (n_frames, 1)
+            spk_id
+
+            wav_{sr}: (T,)
+            wav_{sr}_len: int
+        """
+        single_feature = dict()
+
+        utt_item = self.metadata[index]
+        wav_path = utt_item["Path"]
+
+        ### Use the highest sampling rate to load and randomly select ###
+        highest_sr_wav, _ = librosa.load(wav_path, sr=self.highest_sample_rate)
+        highest_sr_wav = self.random_select(
+            highest_sr_wav, utt_item["Duration"], wav_path
+        )
+
+        ### Waveforms under all the sample rates ###
+        for sr in self.all_sample_rates:
+            # Resample to the required sample rate
+            if sr != self.highest_sample_rate:
+                wav_sr = librosa.resample(
+                    highest_sr_wav, orig_sr=self.highest_sample_rate, target_sr=sr
+                )
+            else:
+                wav_sr = highest_sr_wav
+
+            wav_sr = torch.as_tensor(wav_sr, dtype=torch.float32)
+            single_feature["wav_{}".format(sr)] = wav_sr
+            single_feature["wav_{}_len".format(sr)] = len(wav_sr)
+
+            # For target sample rate
+            if sr == self.sample_rate:
+                wav_len = len(wav_sr)
+                frame_len = wav_len // self.hop_size
+
+                single_feature["wav"] = wav_sr
+                single_feature["wav_len"] = wav_len
+                single_feature["target_len"] = frame_len
+                single_feature["mask"] = torch.ones(frame_len, 1, dtype=torch.long)
+        ### Speaker ID ###
+        if self.cfg.preprocess.use_spkid:
+            utt = "{}_{}".format(utt_item["Dataset"], utt_item["Uid"])
+            single_feature["spk_id"] = torch.tensor(
+                [self.spk2id[self.utt2spk[utt]]], dtype=torch.int32
+            )
+
+        return single_feature
+
+    def __len__(self):
+        return len(self.metadata)
+
+
+class SVCOfflineCollator(BaseOfflineCollator):
     def __init__(self, cfg):
-        BaseCollator.__init__(self, cfg)
+        super().__init__(cfg)
 
     def __call__(self, batch):
-        parsed_batch_features = BaseCollator.__call__(self, batch)
+        parsed_batch_features =
super().__call__(batch) return parsed_batch_features +class SVCOnlineCollator(BaseOnlineCollator): + def __init__(self, cfg): + super().__init__(cfg) + + def __call__(self, batch): + """ + SVCOnlineDataset.__getitem__: + wav: (T,) + wav_len: int + target_len: int + mask: (n_frames, 1) + spk_id: (1) + + wav_{sr}: (T,) + wav_{sr}_len: int + + Returns: + wav: (B, T), torch.float32 + wav_len: (B), torch.long + target_len: (B), torch.long + mask: (B, n_frames, 1), torch.long + spk_id: (B, 1), torch.int32 + + wav_{sr}: (B, T) + wav_{sr}_len: (B), torch.long + """ + packed_batch_features = dict() + + for key in batch[0].keys(): + if "_len" in key: + packed_batch_features[key] = torch.LongTensor([b[key] for b in batch]) + else: + packed_batch_features[key] = pad_sequence( + [b[key] for b in batch], batch_first=True, padding_value=0 + ) + return packed_batch_features + + class SVCTestDataset(BaseTestDataset): def __init__(self, args, cfg, infer_type): BaseTestDataset.__init__(self, args, cfg, infer_type) @@ -179,9 +324,16 @@ def __init__(self, args, cfg, infer_type): "_{}".format(self.target_singer), "" ) if cfg.preprocess.mel_min_max_norm: - self.target_mel_extrema = load_mel_extrema( - cfg.preprocess, self.target_dataset - ) + if self.cfg.preprocess.features_extraction_mode == "online": + # TODO: Change the hard code + + # Using an empirical mel extrema to normalize + self.target_mel_extrema = load_mel_extrema(cfg.preprocess, "vctk") + else: + self.target_mel_extrema = load_mel_extrema( + cfg.preprocess, self.target_dataset + ) + self.target_mel_extrema = torch.as_tensor( self.target_mel_extrema[0] ), torch.as_tensor(self.target_mel_extrema[1]) @@ -370,15 +522,19 @@ def __getitem__(self, index): ######### Get Content Features Item ######### if self.cfg.model.condition_encoder.use_whisper: assert "target_len" in single_feature.keys() - aligned_whisper_feat = self.whisper_aligner.offline_align( - np.load(self.utt2whisper_path[utt]), single_feature["target_len"] + aligned_whisper_feat = ( + self.whisper_aligner.offline_resolution_transformation( + np.load(self.utt2whisper_path[utt]), single_feature["target_len"] + ) ) single_feature["whisper_feat"] = aligned_whisper_feat if self.cfg.model.condition_encoder.use_contentvec: assert "target_len" in single_feature.keys() - aligned_contentvec = self.contentvec_aligner.offline_align( - np.load(self.utt2contentVec_path[utt]), single_feature["target_len"] + aligned_contentvec = ( + self.contentvec_aligner.offline_resolution_transformation( + np.load(self.utt2contentVec_path[utt]), single_feature["target_len"] + ) ) single_feature["contentvec_feat"] = aligned_contentvec @@ -393,7 +549,7 @@ def __getitem__(self, index): if self.cfg.model.condition_encoder.use_wenet: assert "target_len" in single_feature.keys() - aligned_wenet_feat = self.wenet_aligner.offline_align( + aligned_wenet_feat = self.wenet_aligner.offline_resolution_transformation( np.load(self.utt2wenet_path[utt]), single_feature["target_len"] ) single_feature["wenet_feat"] = aligned_wenet_feat diff --git a/models/svc/base/svc_trainer.py b/models/svc/base/svc_trainer.py index 1c6588ed..f2d8f69c 100644 --- a/models/svc/base/svc_trainer.py +++ b/models/svc/base/svc_trainer.py @@ -8,9 +8,19 @@ import torch import torch.nn as nn +import numpy as np from models.base.new_trainer import BaseTrainer -from models.svc.base.svc_dataset import SVCCollator, SVCDataset +from models.svc.base.svc_dataset import ( + SVCOfflineCollator, + SVCOfflineDataset, + SVCOnlineCollator, + SVCOnlineDataset, +) +from 
processors.audio_features_extractor import AudioFeaturesExtractor +from processors.acoustic_extractor import cal_normalized_mel, load_mel_extrema + +EPS = 1.0e-12 class SVCTrainer(BaseTrainer): @@ -37,9 +47,151 @@ def __init__(self, args=None, cfg=None): self.logger.info("Task type: {}".format(self.task_type)) ### Following are methods only for SVC tasks ### - # TODO: LEGACY CODE, NEED TO BE REFACTORED def _build_dataset(self): - return SVCDataset, SVCCollator + self.online_features_extraction = ( + self.cfg.preprocess.features_extraction_mode == "online" + ) + + if not self.online_features_extraction: + return SVCOfflineDataset, SVCOfflineCollator + else: + self.audio_features_extractor = AudioFeaturesExtractor(self.cfg) + return SVCOnlineDataset, SVCOnlineCollator + + def _extract_svc_features(self, batch): + """ + Features extraction during training + + Batch: + wav: (B, T) + wav_len: (B) + target_len: (B) + mask: (B, n_frames, 1) + spk_id: (B, 1) + + wav_{sr}: (B, T) + wav_{sr}_len: (B) + + Added elements when output: + mel: (B, n_frames, n_mels) + frame_pitch: (B, n_frames) + frame_uv: (B, n_frames) + frame_energy: (B, n_frames) + frame_{content}: (B, n_frames, D) + """ + + padded_n_frames = torch.max(batch["target_len"]) + final_n_frames = padded_n_frames + + ### Mel Spectrogram ### + if self.cfg.preprocess.use_mel: + # (B, n_mels, n_frames) + raw_mel = self.audio_features_extractor.get_mel_spectrogram(batch["wav"]) + if self.cfg.preprocess.use_min_max_norm_mel: + # TODO: Change the hard code + + # Using the empirical mel extrema to denormalize + if not hasattr(self, "mel_extrema"): + # (n_mels) + m, M = load_mel_extrema(self.cfg.preprocess, "vctk") + # (1, n_mels, 1) + m = ( + torch.as_tensor(m, device=raw_mel.device) + .unsqueeze(0) + .unsqueeze(-1) + ) + M = ( + torch.as_tensor(M, device=raw_mel.device) + .unsqueeze(0) + .unsqueeze(-1) + ) + self.mel_extrema = m, M + + m, M = self.mel_extrema + mel = (raw_mel - m) / (M - m + EPS) * 2 - 1 + + else: + mel = raw_mel + + final_n_frames = min(final_n_frames, mel.size(-1)) + + # (B, n_frames, n_mels) + batch["mel"] = mel.transpose(1, 2) + else: + raw_mel = None + + ### F0 ### + if self.cfg.preprocess.use_frame_pitch: + # (B, n_frames) + raw_f0, raw_uv = self.audio_features_extractor.get_f0( + batch["wav"], + wav_lens=batch["wav_len"], + use_interpolate=self.cfg.preprocess.use_interpolation_for_uv, + return_uv=True, + ) + final_n_frames = min(final_n_frames, raw_f0.size(-1)) + batch["frame_pitch"] = raw_f0 + + if self.cfg.preprocess.use_uv: + batch["frame_uv"] = raw_uv + + ### Energy ### + if self.cfg.preprocess.use_frame_energy: + # (B, n_frames) + raw_energy = self.audio_features_extractor.get_energy( + batch["wav"], mel_spec=raw_mel + ) + final_n_frames = min(final_n_frames, raw_energy.size(-1)) + batch["frame_energy"] = raw_energy + + ### Semantic Features ### + if self.cfg.model.condition_encoder.use_whisper: + # (B, n_frames, D) + whisper_feats = self.audio_features_extractor.get_whisper_features( + wavs=batch["wav_{}".format(self.cfg.preprocess.whisper_sample_rate)], + target_frame_len=padded_n_frames, + ) + final_n_frames = min(final_n_frames, whisper_feats.size(1)) + batch["whisper_feat"] = whisper_feats + + if self.cfg.model.condition_encoder.use_contentvec: + # (B, n_frames, D) + contentvec_feats = self.audio_features_extractor.get_contentvec_features( + wavs=batch["wav_{}".format(self.cfg.preprocess.contentvec_sample_rate)], + target_frame_len=padded_n_frames, + ) + final_n_frames = min(final_n_frames, 
contentvec_feats.size(1)) + batch["contentvec_feat"] = contentvec_feats + + if self.cfg.model.condition_encoder.use_wenet: + # (B, n_frames, D) + wenet_feats = self.audio_features_extractor.get_wenet_features( + wavs=batch["wav_{}".format(self.cfg.preprocess.wenet_sample_rate)], + target_frame_len=padded_n_frames, + wav_lens=batch[ + "wav_{}_len".format(self.cfg.preprocess.wenet_sample_rate) + ], + ) + final_n_frames = min(final_n_frames, wenet_feats.size(1)) + batch["wenet_feat"] = wenet_feats + + ### Align all the audio features to the same frame length ### + frame_level_features = [ + "mask", + "mel", + "frame_pitch", + "frame_uv", + "frame_energy", + "whisper_feat", + "contentvec_feat", + "wenet_feat", + ] + for k in frame_level_features: + if k in batch: + # (B, n_frames, ...) + batch[k] = batch[k][:, :final_n_frames].contiguous() + + return batch @staticmethod def _build_criterion(): @@ -51,15 +203,15 @@ def _compute_loss(criterion, y_pred, y_gt, loss_mask): """ Args: criterion: MSELoss(reduction='none') - y_pred, y_gt: (bs, seq_len, D) - loss_mask: (bs, seq_len, 1) + y_pred, y_gt: (B, seq_len, D) + loss_mask: (B, seq_len, 1) Returns: loss: Tensor of shape [] """ - # (bs, seq_len, D) + # (B, seq_len, D) loss = criterion(y_pred, y_gt) - # expand loss_mask to (bs, seq_len, D) + # expand loss_mask to (B, seq_len, D) loss_mask = loss_mask.repeat(1, 1, loss.shape[-1]) loss = torch.sum(loss * loss_mask) / torch.sum(loss_mask) diff --git a/models/svc/diffusion/diffusion_trainer.py b/models/svc/diffusion/diffusion_trainer.py index 6f5aeb56..f1c59f21 100644 --- a/models/svc/diffusion/diffusion_trainer.py +++ b/models/svc/diffusion/diffusion_trainer.py @@ -57,13 +57,29 @@ def count_parameters(self, model): model_param = sum(p.numel() for p in model.parameters()) return model_param + def _check_nan(self, batch, loss, y_pred, y_gt): + if torch.any(torch.isnan(loss)): + for k, v in batch.items(): + self.logger.info(k) + self.logger.info(v) + + super()._check_nan(loss, y_pred, y_gt) + def _forward_step(self, batch): r"""Forward step for training and inference. This function is called in ``_train_step`` & ``_test_step`` function. 
""" - device = self.accelerator.device + if self.online_features_extraction: + # On-the-fly features extraction + batch = self._extract_svc_features(batch) + + # To debug + # for k, v in batch.items(): + # print(k, v.shape, v) + # exit() + mel_input = batch["mel"] noise = torch.randn_like(mel_input, device=device, dtype=torch.float32) batch_size = mel_input.size(0) @@ -80,9 +96,7 @@ def _forward_step(self, batch): y_pred = self.acoustic_mapper(noisy_mel, timesteps, conditioner) - # TODO: Predict noise or gt should be configurable loss = self._compute_loss(self.criterion, y_pred, noise, batch["mask"]) - self._check_nan(loss, y_pred, noise) + self._check_nan(batch, loss, y_pred, noise) - # FIXME: Clarify that we should not divide it with batch size here return loss diff --git a/models/svc/vits/vits.py b/models/svc/vits/vits.py index baa857ee..983a704e 100644 --- a/models/svc/vits/vits.py +++ b/models/svc/vits/vits.py @@ -10,7 +10,6 @@ from torch.nn import functional as F from utils.util import * -from utils.f0 import f0_to_coarse from modules.transformer.attentions import Encoder from models.tts.vits.vits import ResidualCouplingBlock, PosteriorEncoder diff --git a/models/svc/vits/vits_trainer.py b/models/svc/vits/vits_trainer.py index 766a5faf..8be6d374 100644 --- a/models/svc/vits/vits_trainer.py +++ b/models/svc/vits/vits_trainer.py @@ -11,7 +11,7 @@ import accelerate # from models.svc.base import SVCTrainer -from models.svc.base.svc_dataset import SVCCollator, SVCDataset +from models.svc.base.svc_dataset import SVCOfflineCollator, SVCOfflineDataset from models.svc.vits.vits import * from models.svc.base import SVCTrainer @@ -110,7 +110,7 @@ def _build_model(self): return model def _build_dataset(self): - return SVCDataset, SVCCollator + return SVCOfflineDataset, SVCOfflineCollator def _build_optimizer(self): optimizer_g = torch.optim.AdamW( diff --git a/models/tta/autoencoder/autoencoder_dataset.py b/models/tta/autoencoder/autoencoder_dataset.py index 764008c8..8b9b4bdf 100644 --- a/models/tta/autoencoder/autoencoder_dataset.py +++ b/models/tta/autoencoder/autoencoder_dataset.py @@ -8,17 +8,17 @@ from torch.nn.utils.rnn import pad_sequence from utils.data_utils import * from models.base.base_dataset import ( - BaseCollator, - BaseDataset, + BaseOfflineCollator, + BaseOfflineDataset, BaseTestDataset, BaseTestCollator, ) import librosa -class AutoencoderKLDataset(BaseDataset): +class AutoencoderKLDataset(BaseOfflineDataset): def __init__(self, cfg, dataset, is_valid=False): - BaseDataset.__init__(self, cfg, dataset, is_valid=is_valid) + BaseOfflineDataset.__init__(self, cfg, dataset, is_valid=is_valid) cfg = self.cfg @@ -56,7 +56,7 @@ def __getitem__(self, index): # melspec: (n_mels, T) # wav: (T,) - single_feature = BaseDataset.__getitem__(self, index) + single_feature = BaseOfflineDataset.__getitem__(self, index) utt_info = self.metadata[index] dataset = utt_info["Dataset"] @@ -81,9 +81,9 @@ def __len__(self): return len(self.metadata) -class AutoencoderKLCollator(BaseCollator): +class AutoencoderKLCollator(BaseOfflineCollator): def __init__(self, cfg): - BaseCollator.__init__(self, cfg) + BaseOfflineCollator.__init__(self, cfg) def __call__(self, batch): # mel: (B, n_mels, T) diff --git a/models/tta/ldm/audioldm_dataset.py b/models/tta/ldm/audioldm_dataset.py index 2bb176b9..96232536 100644 --- a/models/tta/ldm/audioldm_dataset.py +++ b/models/tta/ldm/audioldm_dataset.py @@ -10,8 +10,8 @@ from models.base.base_dataset import ( - BaseCollator, - BaseDataset, + BaseOfflineCollator, + 
BaseOfflineDataset, BaseTestDataset, BaseTestCollator, ) @@ -20,9 +20,9 @@ from transformers import AutoTokenizer -class AudioLDMDataset(BaseDataset): +class AudioLDMDataset(BaseOfflineDataset): def __init__(self, cfg, dataset, is_valid=False): - BaseDataset.__init__(self, cfg, dataset, is_valid=is_valid) + BaseOfflineDataset.__init__(self, cfg, dataset, is_valid=is_valid) self.cfg = cfg @@ -70,7 +70,7 @@ def __getitem__(self, index): # melspec: (n_mels, T) # wav: (T,) - single_feature = BaseDataset.__getitem__(self, index) + single_feature = BaseOfflineDataset.__getitem__(self, index) utt_info = self.metadata[index] dataset = utt_info["Dataset"] @@ -105,9 +105,9 @@ def __len__(self): return len(self.metadata) -class AudioLDMCollator(BaseCollator): +class AudioLDMCollator(BaseOfflineCollator): def __init__(self, cfg): - BaseCollator.__init__(self, cfg) + BaseOfflineCollator.__init__(self, cfg) self.tokenizer = AutoTokenizer.from_pretrained("t5-base", model_max_length=512) diff --git a/models/tts/base/tts_dataset.py b/models/tts/base/tts_dataset.py index fc85afb9..dca129e2 100644 --- a/models/tts/base/tts_dataset.py +++ b/models/tts/base/tts_dataset.py @@ -15,8 +15,8 @@ from processors.acoustic_extractor import cal_normalized_mel from models.base.base_dataset import ( - BaseDataset, - BaseCollator, + BaseOfflineDataset, + BaseOfflineCollator, BaseTestDataset, BaseTestCollator, ) @@ -28,7 +28,7 @@ ) -class TTSDataset(BaseDataset): +class TTSDataset(BaseOfflineDataset): def __init__(self, cfg, dataset, is_valid=False): """ Args: @@ -294,7 +294,7 @@ def get_metadata(self): return super().get_metadata() -class TTSCollator(BaseCollator): +class TTSCollator(BaseOfflineCollator): """Zero-pads model inputs and targets based on number of frames per step""" def __init__(self, cfg): diff --git a/models/tts/fastspeech2/fs2_dataset.py b/models/tts/fastspeech2/fs2_dataset.py index 9d019aee..dae5b045 100644 --- a/models/tts/fastspeech2/fs2_dataset.py +++ b/models/tts/fastspeech2/fs2_dataset.py @@ -8,17 +8,17 @@ from torch.nn.utils.rnn import pad_sequence from utils.data_utils import * from models.base.base_dataset import ( - BaseCollator, - BaseDataset, + BaseOfflineCollator, + BaseOfflineDataset, BaseTestDataset, BaseTestCollator, ) from text import text_to_sequence -class FS2Dataset(BaseDataset): +class FS2Dataset(BaseOfflineDataset): def __init__(self, cfg, dataset, is_valid=False): - BaseDataset.__init__(self, cfg, dataset, is_valid=is_valid) + BaseOfflineDataset.__init__(self, cfg, dataset, is_valid=is_valid) self.batch_size = cfg.train.batch_size cfg = cfg.preprocess # utt2duration @@ -99,7 +99,7 @@ def __init__(self, cfg, dataset, is_valid=False): self.metadata = self.check_metadata() def __getitem__(self, index): - single_feature = BaseDataset.__getitem__(self, index) + single_feature = BaseOfflineDataset.__getitem__(self, index) utt_info = self.metadata[index] dataset = utt_info["Dataset"] @@ -221,11 +221,11 @@ def check_metadata(self): return new_metadata -class FS2Collator(BaseCollator): +class FS2Collator(BaseOfflineCollator): """Zero-pads model inputs and targets based on number of frames per step""" def __init__(self, cfg): - BaseCollator.__init__(self, cfg) + BaseOfflineCollator.__init__(self, cfg) self.sort = cfg.train.sort_sample self.batch_size = cfg.train.batch_size self.drop_last = cfg.train.drop_last diff --git a/models/tts/naturalspeech2/ns2_dataset.py b/models/tts/naturalspeech2/ns2_dataset.py index cb7d9183..df10f3fa 100644 --- a/models/tts/naturalspeech2/ns2_dataset.py +++ 
b/models/tts/naturalspeech2/ns2_dataset.py @@ -10,8 +10,8 @@ from processors.acoustic_extractor import cal_normalized_mel from processors.acoustic_extractor import load_normalized from models.base.base_dataset import ( - BaseCollator, - BaseDataset, + BaseOfflineCollator, + BaseOfflineDataset, BaseTestDataset, BaseTestCollator, ) @@ -355,9 +355,9 @@ def get_target_and_reference(self, code, pitch, duration, phone_id, frame_nums): } -class NS2Collator(BaseCollator): +class NS2Collator(BaseOfflineCollator): def __init__(self, cfg): - BaseCollator.__init__(self, cfg) + BaseOfflineCollator.__init__(self, cfg) def __call__(self, batch): packed_batch_features = dict() diff --git a/modules/encoder/condition_encoder.py b/modules/encoder/condition_encoder.py index 88ee892e..1600d078 100644 --- a/modules/encoder/condition_encoder.py +++ b/modules/encoder/condition_encoder.py @@ -52,8 +52,6 @@ def __init__(self, cfg): self.input_dim = self.cfg.input_melody_dim self.output_dim = self.cfg.output_melody_dim self.n_bins = self.cfg.n_bins_melody - self.pitch_min = self.cfg.pitch_min - self.pitch_max = self.cfg.pitch_max if self.input_dim != 0: if self.n_bins == 0: @@ -69,26 +67,18 @@ def __init__(self, cfg): padding_idx=None, ) self.uv_embedding = nn.Embedding(2, self.output_dim) - # self.conformer = Conformer( - # input_dim=self.output_dim, - # num_heads=4, - # ffn_dim=128, - # num_layers=4, - # depthwise_conv_kernel_size=3, - # ) def forward(self, x, uv=None, length=None): - # x: (N, frame_len) - # print(x.shape) + # x: (B, frame_len) if self.n_bins == 0: x = x.unsqueeze(-1) else: x = f0_to_coarse(x, self.n_bins, self.f0_min, self.f0_max) x = self.nn(x) - if uv is not None: + + if self.cfg.use_uv: uv = self.uv_embedding(uv) x = x + uv - # x, _ = self.conformer(x, length) return x @@ -106,21 +96,19 @@ def __init__(self, cfg): # Not use quantization self.nn = nn.Linear(self.input_dim, self.output_dim) else: - # TODO: set trivially now + # TODO: set empirically now self.loudness_min = 1e-30 self.loudness_max = 1.5 - - if cfg.use_log_loudness: - self.energy_bins = nn.Parameter( - torch.exp( - torch.linspace( - np.log(self.loudness_min), - np.log(self.loudness_max), - self.n_bins - 1, - ) - ), - requires_grad=False, - ) + self.energy_bins = nn.Parameter( + torch.exp( + torch.linspace( + np.log(self.loudness_min), + np.log(self.loudness_max), + self.n_bins - 1, + ) + ), + requires_grad=False, + ) self.nn = nn.Embedding( num_embeddings=self.n_bins, @@ -160,50 +148,55 @@ class ConditionEncoder(nn.Module): def __init__(self, cfg): super().__init__() self.cfg = cfg - self.merge_mode = cfg.merge_mode + ### Semantic Features ### if cfg.use_whisper: self.whisper_encoder = ContentEncoder( self.cfg, self.cfg.whisper_dim, self.cfg.content_encoder_dim ) - if cfg.use_contentvec: self.contentvec_encoder = ContentEncoder( self.cfg, self.cfg.contentvec_dim, self.cfg.content_encoder_dim ) - if cfg.use_mert: self.mert_encoder = ContentEncoder( self.cfg, self.cfg.mert_dim, self.cfg.content_encoder_dim ) - if cfg.use_wenet: self.wenet_encoder = ContentEncoder( self.cfg, self.cfg.wenet_dim, self.cfg.content_encoder_dim ) - self.melody_encoder = MelodyEncoder(self.cfg) - self.loudness_encoder = LoudnessEncoder(self.cfg) + ### Prosody Features ### + if cfg.use_f0: + self.melody_encoder = MelodyEncoder(self.cfg) + if cfg.use_energy: + self.loudness_encoder = LoudnessEncoder(self.cfg) + + ### Speaker Features ### if cfg.use_spkid: self.singer_encoder = SingerEncoder(self.cfg) def forward(self, x): outputs = [] - if "frame_pitch" in 
x.keys(): - if "frame_uv" not in x.keys(): - x["frame_uv"] = None - pitch_enc_out = self.melody_encoder( - x["frame_pitch"], uv=x["frame_uv"], length=x["target_len"] - ) + if self.cfg.use_f0: + if self.cfg.use_uv: + pitch_enc_out = self.melody_encoder( + x["frame_pitch"], uv=x["frame_uv"], length=x["target_len"] + ) + else: + pitch_enc_out = self.melody_encoder( + x["frame_pitch"], uv=None, length=x["target_len"] + ) outputs.append(pitch_enc_out) - if "frame_energy" in x.keys(): + if self.cfg.use_energy: loudness_enc_out = self.loudness_encoder(x["frame_energy"]) outputs.append(loudness_enc_out) - if "whisper_feat" in x.keys(): + if self.cfg.use_whisper: # whisper_feat: [b, T, 1024] whiser_enc_out = self.whisper_encoder( x["whisper_feat"], length=x["target_len"] @@ -211,24 +204,24 @@ def forward(self, x): outputs.append(whiser_enc_out) seq_len = whiser_enc_out.shape[1] - if "contentvec_feat" in x.keys(): + if self.cfg.use_contentvec: contentvec_enc_out = self.contentvec_encoder( x["contentvec_feat"], length=x["target_len"] ) outputs.append(contentvec_enc_out) seq_len = contentvec_enc_out.shape[1] - if "mert_feat" in x.keys(): + if self.cfg.use_mert: mert_enc_out = self.mert_encoder(x["mert_feat"], length=x["target_len"]) outputs.append(mert_enc_out) seq_len = mert_enc_out.shape[1] - if "wenet_feat" in x.keys(): + if self.cfg.use_wenet: wenet_enc_out = self.wenet_encoder(x["wenet_feat"], length=x["target_len"]) outputs.append(wenet_enc_out) seq_len = wenet_enc_out.shape[1] - if "spk_id" in x.keys(): + if self.cfg.use_spkid: speaker_enc_out = self.singer_encoder(x["spk_id"]) # [b, 1, 384] assert ( "whisper_feat" in x.keys() diff --git a/processors/acoustic_extractor.py b/processors/acoustic_extractor.py index 25110e59..9c4d9be7 100644 --- a/processors/acoustic_extractor.py +++ b/processors/acoustic_extractor.py @@ -450,6 +450,10 @@ def extract_utt_acoustic_features_vocoder(dataset_output, cfg, utt): def cal_normalized_mel(mel, dataset_name, cfg): + """ + mel: (n_mels, T) + """ + # mel_min, mel_max: (n_mels) mel_min, mel_max = load_mel_extrema(cfg, dataset_name) mel_norm = normalize_mel_channel(mel, mel_min, mel_max) return mel_norm @@ -529,6 +533,10 @@ def denormalize_mel_channel(mel, mel_min, mel_max): def normalize_mel_channel(mel, mel_min, mel_max): + """ + mel: (n_mels, T) + mel_min, mel_max: (n_mels) + """ mel_min = np.expand_dims(mel_min, -1) mel_max = np.expand_dims(mel_max, -1) return (mel - mel_min) / (mel_max - mel_min + ZERO) * 2 - 1 diff --git a/processors/audio_features_extractor.py b/processors/audio_features_extractor.py new file mode 100644 index 00000000..8e38bd5e --- /dev/null +++ b/processors/audio_features_extractor.py @@ -0,0 +1,157 @@ +# Copyright (c) 2023 Amphion. + +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" + +This module aims to be an entrance that integrates all the functions for extracting features from raw audio. + +The common audio features include: +1. Acoustic features such as Mel Spectrogram, F0, Energy, etc. +2. Content features such as phonetic posteriorgrams (PPG) and bottleneck features (BNF) from pretrained models + +Note: +All the features extraction are designed to utilize GPU to the maximum extent, which can ease the on-the-fly extraction for large-scale dataset. 
+
+"""
+
+import torch
+from torch.nn.utils.rnn import pad_sequence
+
+from utils.mel import extract_mel_features
+from utils.f0 import get_f0 as extract_f0_features
+from processors.content_extractor import (
+    WhisperExtractor,
+    ContentvecExtractor,
+    WenetExtractor,
+)
+
+
+class AudioFeaturesExtractor:
+    def __init__(self, cfg):
+        """
+        Args:
+            cfg: Amphion config that would be used to specify the processing parameters
+        """
+        self.cfg = cfg
+
+    def get_mel_spectrogram(self, wavs):
+        """Get Mel Spectrogram Features
+
+        Args:
+            wavs: Tensor whose shape is (B, T)
+
+        Returns:
+            Tensor whose shape is (B, n_mels, n_frames)
+        """
+        return extract_mel_features(y=wavs, cfg=self.cfg.preprocess)
+
+    def get_f0(self, wavs, wav_lens=None, use_interpolate=False, return_uv=False):
+        """Get F0 Features
+
+        Args:
+            wavs: Tensor whose shape is (B, T)
+
+        Returns:
+            Tensor whose shape is (B, n_frames)
+        """
+        device = wavs.device
+
+        f0s = []
+        uvs = []
+        for i, w in enumerate(wavs):
+            if wav_lens is not None:
+                w = w[: wav_lens[i]]
+
+            f0, uv = extract_f0_features(
+                # Use numpy to extract
+                w.cpu().numpy(),
+                self.cfg.preprocess,
+                use_interpolate=use_interpolate,
+                return_uv=True,
+            )
+            f0s.append(torch.as_tensor(f0, device=device))
+            uvs.append(torch.as_tensor(uv, device=device, dtype=torch.long))
+
+        # (B, n_frames)
+        f0s = pad_sequence(f0s, batch_first=True, padding_value=0)
+        uvs = pad_sequence(uvs, batch_first=True, padding_value=0)
+
+        if return_uv:
+            return f0s, uvs
+
+        return f0s
+
+    def get_energy(self, wavs, mel_spec=None):
+        """Get Energy Features
+
+        Args:
+            wavs: Tensor whose shape is (B, T)
+            mel_spec: Tensor whose shape is (B, n_mels, n_frames)
+
+        Returns:
+            Tensor whose shape is (B, n_frames)
+        """
+        if mel_spec is None:
+            mel_spec = self.get_mel_spectrogram(wavs)
+
+        energies = (mel_spec.exp() ** 2).sum(dim=1).sqrt()
+        return energies
+
+    def get_whisper_features(self, wavs, target_frame_len):
+        """Get Whisper Features
+
+        Args:
+            wavs: Tensor whose shape is (B, T)
+            target_frame_len: int
+
+        Returns:
+            Tensor whose shape is (B, target_frame_len, D)
+        """
+        if not hasattr(self, "whisper_extractor"):
+            self.whisper_extractor = WhisperExtractor(self.cfg)
+            self.whisper_extractor.load_model()
+
+        whisper_feats = self.whisper_extractor.extract_content_features(wavs)
+        whisper_feats = self.whisper_extractor.ReTrans(whisper_feats, target_frame_len)
+        return whisper_feats
+
+    def get_contentvec_features(self, wavs, target_frame_len):
+        """Get ContentVec Features
+
+        Args:
+            wavs: Tensor whose shape is (B, T)
+            target_frame_len: int
+
+        Returns:
+            Tensor whose shape is (B, target_frame_len, D)
+        """
+        if not hasattr(self, "contentvec_extractor"):
+            self.contentvec_extractor = ContentvecExtractor(self.cfg)
+            self.contentvec_extractor.load_model()
+
+        contentvec_feats = self.contentvec_extractor.extract_content_features(wavs)
+        contentvec_feats = self.contentvec_extractor.ReTrans(
+            contentvec_feats, target_frame_len
+        )
+        return contentvec_feats
+
+    def get_wenet_features(self, wavs, target_frame_len, wav_lens=None):
+        """Get WeNet Features
+
+        Args:
+            wavs: Tensor whose shape is (B, T)
+            target_frame_len: int
+            wav_lens: Tensor whose shape is (B)
+
+        Returns:
+            Tensor whose shape is (B, target_frame_len, D)
+        """
+        if not hasattr(self, "wenet_extractor"):
+            self.wenet_extractor = WenetExtractor(self.cfg)
+            self.wenet_extractor.load_model()
+
+        wenet_feats = self.wenet_extractor.extract_content_features(wavs, lens=wav_lens)
+        wenet_feats = self.wenet_extractor.ReTrans(wenet_feats, target_frame_len)
+        return wenet_feats
diff --git a/processors/content_extractor.py b/processors/content_extractor.py
index a116e37f..f2353009 100644
--- a/processors/content_extractor.py
+++ b/processors/content_extractor.py
@@ -58,20 +58,14 @@
 """


-class BaseExtractor:
-    def __init__(self, cfg):
+class AudioPretrainedModelFeaturesExtractor:
+    def __init__(self, cfg, extractor_type):
         self.cfg = cfg
-        self.extractor_type = None
+        self.extractor_type = extractor_type
         self.model = None
+        self.init_for_retrans()

-    def offline_align(self, content, target_len):
-        """
-        args:
-            content: (source_len, dim)
-            target_len: target length
-        return:
-            mapped_feature: (target_len, dim)
-        """
+    def init_for_retrans(self):
         target_hop = self.cfg.preprocess.hop_size

         assert self.extractor_type in ["whisper", "contentvec", "wenet"]
@@ -97,6 +91,20 @@ def offline_align(self, content, target_len):
             source_hop //= factor
             target_hop //= factor

+        self.source_hop = source_hop
+        self.target_hop = target_hop
+
+    def offline_resolution_transformation(self, content, target_len):
+        """
+        args:
+            content: (source_len, dim)
+            target_len: target length
+        return:
+            mapped_feature: (target_len, dim)
+        """
+        source_hop = self.source_hop
+        target_hop = self.target_hop
+
         # (source_len, 256)
         _, width = content.shape
         # slice the content from padded feature
@@ -139,20 +147,73 @@ def offline_align(self, content, target_len):

         return mapped_feature

-    def save_feature(self, utt, content_feature):
-        """Save a single utternace to path {cfg.preprocess.processed_dir}
+    def log_for_ReTrans(self, err):
+        err_log_dir = os.path.join(
+            self.cfg.preprocess.processed_dir, "align_max_err.log"
+        )
+        try:
+            with open(err_log_dir, "r") as f:
+                err_num = int(f.read())
+        except:
+            with open(err_log_dir, "w") as f:
+                f.write("0")
+            err_num = 0
+        if err > err_num:
+            with open(err_log_dir, "w") as f:
+                f.write(str(err))
+
+    def ReTrans(self, source_feats, padded_target_len):
+        """
+        Resolution Transformation for mismatched frames alignment.

-        Args:
-            utt (dict): one item in metadata, containing information for one utterance
-            content_feature (tensor): content feature of one utterance
+        TODO: Merge this with the offline resolution_transformation into one implementation
+
+        args:
+            source_feats: Tensor, (B, padded_source_len, D)
+            padded_target_len: int, the maximum target length in a batch
+        return:
+            mapped_feature: Tensor, (B, padded_target_len, D)
         """
-        uid = utt["Uid"]
-        assert self.extractor_type != None
-        out_dir = os.path.join(
-            self.cfg.preprocess.processed_dir, utt["Dataset"], self.extractor_type
+        source_hop = self.source_hop
+        target_hop = self.target_hop
+
+        # (B, padded_source_len, D)
+        B, padded_source_len, D = source_feats.shape
+
+        # select the valid content from padded feature
+        source_len = min(
+            padded_target_len * target_hop // source_hop + 1, padded_source_len
         )
-        os.makedirs(out_dir, exist_ok=True)
-        save_path = os.path.join(out_dir, uid + ".npy")
+
+        # const ~= padded_target_len * target_hop (padded wav's duration)
+        const = source_len * source_hop // target_hop * target_hop
+
+        # (B, padded_source_len, D) -> (B, padded_source_len * source_hop, D) -> (B, const, D)
+        up_sampling_feats = torch.repeat_interleave(source_feats, source_hop, dim=1)[
+            :, :const
+        ]
+        # (B, const, D) -> (B, const/target_hop, target_hop, D) -> (B, const/target_hop, D)
+        down_sampling_feats = torch.mean(
+            up_sampling_feats.reshape(B, -1, target_hop, D), dim=2
+        )
+
+        err = abs(padded_target_len - down_sampling_feats.shape[1])
+        if err > 8:
+            self.log_for_ReTrans(err)
+
+        if down_sampling_feats.shape[1] < padded_target_len:
+            # (B, 1, D) -> (B, err, D)
+            end = down_sampling_feats[:, -1, :][:, None, :].repeat_interleave(
+                err, dim=1
+            )
+            # -> (B, padded_target_len, D)
+            down_sampling_feats = torch.cat([down_sampling_feats, end], dim=1)
+
+        # (B, padded_target_len, D)
+        mapped_feature = down_sampling_feats[:, :padded_target_len]
+        return mapped_feature
+
+    def get_valid_features(self, utt, content_feature):
         # only keep effective parts
         duration = utt["Duration"]
         if self.extractor_type == "whisper":
@@ -171,20 +232,37 @@ def save_feature(self, utt, content_feature):
             frameshift = self.cfg.preprocess.mert_frameshift
         else:
             raise NotImplementedError
+
         # calculate the number of valid frames
         num_frames = int(np.ceil((duration - frameshift) / frameshift)) + 1
-        # (num_frames, dim) -> (valid_frames, dim)
         assert (
             len(content_feature.shape) == 2
         ), "content feature shape error, it should be (num_frames, dim)"
         content_feature = content_feature[:num_frames, :]
+        return content_feature
+
+    def save_feature(self, utt, content_feature):
+        """Save a single utterance to path {cfg.preprocess.processed_dir}
+
+        Args:
+            utt (dict): one item in metadata, containing information for one utterance
+            content_feature (tensor): content feature of one utterance
+        """
+        uid = utt["Uid"]
+        assert self.extractor_type != None
+        out_dir = os.path.join(
+            self.cfg.preprocess.processed_dir, utt["Dataset"], self.extractor_type
+        )
+        os.makedirs(out_dir, exist_ok=True)
+        save_path = os.path.join(out_dir, uid + ".npy")
+
+        content_feature = self.get_valid_features(utt, content_feature)
         np.save(save_path, content_feature.cpu().detach().numpy())


-class WhisperExtractor(BaseExtractor):
+class WhisperExtractor(AudioPretrainedModelFeaturesExtractor):
     def __init__(self, config):
-        super(WhisperExtractor, self).__init__(config)
-        self.extractor_type = "whisper"
+        super(WhisperExtractor, self).__init__(config, extractor_type="whisper")
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

     def load_model(self):
@@ -217,11 +295,10 @@ def load_model(self):

         self.model = model.eval()

-    def extract_content_features(self, wavs, lens):
+    def extract_content_features(self, wavs):
         """extract content features from a batch of dataloader
         Args:
             wavs: tensor (batch_size, T)
-            lens: list
         """
         # wavs: (batch, max_len)
         wavs = whisper.pad_or_trim(wavs)
@@ -233,10 +310,9 @@ def extract_content_features(self, wavs, lens):
         return features


-class ContentvecExtractor(BaseExtractor):
+class ContentvecExtractor(AudioPretrainedModelFeaturesExtractor):
     def __init__(self, cfg):
-        super(ContentvecExtractor, self).__init__(cfg)
-        self.extractor_type = "contentvec"
+        super(ContentvecExtractor, self).__init__(cfg, extractor_type="contentvec")

     def load_model(self):
         assert self.model == None
@@ -257,11 +333,10 @@ def load_model(self):

         self.model = model

-    def extract_content_features(self, wavs, lens):
+    def extract_content_features(self, wavs):
         """extract content features from a batch of dataloader
         Args:
             wavs: tensor (batch, T)
-            lens: list
         """
         device = next(self.model.parameters()).device
         wavs = wavs.to(device)  # (batch, max_len)
@@ -275,10 +350,9 @@ def extract_content_features(self, wavs, lens):
         return feats


-class WenetExtractor(BaseExtractor):
+class WenetExtractor(AudioPretrainedModelFeaturesExtractor):
     def __init__(self, config):
-        super(WenetExtractor, self).__init__(config)
-        self.extractor_type = "wenet"
+        super(WenetExtractor, self).__init__(config, extractor_type="wenet")

     def load_model(self):
         wenet_cfg = self.cfg.preprocess.wenet_config
@@ -302,7 +376,7 @@ def load_model(self):
     def extract_content_features(self, wavs, lens):
         """extract content features from a batch of dataloader
         Args:
-            wavs: tensor
+            wavs: tensor, whose shape is (B, T)
             lens: list
         """
         feats_list = []
@@ -365,10 +439,9 @@ def extract_content_features(self, wavs, lens):
         return features


-class MertExtractor(BaseExtractor):
+class MertExtractor(AudioPretrainedModelFeaturesExtractor):
     def __init__(self, cfg):
-        super(MertExtractor, self).__init__(cfg)
-        self.extractor_type = "mert"
+        super(MertExtractor, self).__init__(cfg, extractor_type="mert")
         self.preprocessor = None

     def load_model(self):
@@ -389,11 +462,10 @@ def load_model(self):
         self.model = model
         self.preprocessor = preprocessor

-    def extract_content_features(self, wavs, lens):
+    def extract_content_features(self, wavs):
         """extract content features from a batch of dataloader
         Args:
             wavs: tensor (batch, T)
-            lens: list
         """
         with torch.no_grad():
             sample_rate = self.preprocessor.sampling_rate
diff --git a/processors/descriptive_text_features_extractor.py b/processors/descriptive_text_features_extractor.py
new file mode 100644
index 00000000..2e9cf05b
--- /dev/null
+++ b/processors/descriptive_text_features_extractor.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2023 Amphion.
+
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+TODO:
+
+This module aims to be the entrance that integrates all the "descriptive text" feature extraction functions.
+
+The common descriptive text features include:
+1. Global semantic guidance features extracted from pretrained text models such as T5. They can be adopted for TTA, TTM, etc.
+
+Note:
+All the feature extraction functions are designed to utilize the GPU to the maximum extent, which can ease on-the-fly extraction for large-scale datasets.
+
+"""
diff --git a/processors/text_features_extractor.py b/processors/text_features_extractor.py
new file mode 100644
index 00000000..b67f3dec
--- /dev/null
+++ b/processors/text_features_extractor.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2023 Amphion.
+
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+TODO:
+
+This module aims to be the entrance that integrates all the "text" feature extraction functions.
+
+The common text features include:
+1. Phone features that are used for TTS, SVS, etc.
+
+Note:
+All the feature extraction functions are designed to utilize the GPU to the maximum extent, which can ease on-the-fly extraction for large-scale datasets.
+
+"""
diff --git a/utils/f0.py b/utils/f0.py
index fcd95396..169b1403 100644
--- a/utils/f0.py
+++ b/utils/f0.py
@@ -11,45 +11,24 @@
 import pyworld as pw


-def get_bin_index(f0, m, M, n_bins, use_log_scale):
+def f0_to_coarse(f0, pitch_bin, f0_min, f0_max):
     """
-    WARNING: to abandon!
+    Convert f0 (Hz) to pitch (mel scale), and then quantize the mel-scale pitch into the
+    range [1, 2, 3, ..., pitch_bin-1]
+
+    Reference: https://en.wikipedia.org/wiki/Mel_scale

     Args:
-        raw_f0: tensor whose shpae is (N, frame_len)
+        f0 (array or Tensor): Hz
+        pitch_bin (int): the vocabulary size
+        f0_min (int): the minimum f0 (Hz)
+        f0_max (int): the maximum f0 (Hz)
+
     Returns:
-        index: tensor whose shape is same to f0
+        quantized f0 (array or Tensor)
     """
-    raw_f0 = f0.clone()
-    raw_m, raw_M = m, M
-
-    if use_log_scale:
-        f0[torch.where(f0 == 0)] = 1
-        f0 = torch.log(f0)
-        m, M = float(np.log(m)), float(np.log(M))
-
-    # Set normal index in [1, n_bins - 1]
-    width = (M + 1e-7 - m) / (n_bins - 1)
-    index = (f0 - m) // width + 1
-    # Set unvoiced frames as 0, Therefore, the vocabulary is [0, n_bins- 1], whose size is n_bins
-    index[torch.where(f0 == 0)] = 0
-
-    # TODO: Boundary check (special: to judge whether 0 for unvoiced)
-    if torch.any(raw_f0 > raw_M):
-        print("F0 Warning: too high f0: {}".format(raw_f0[torch.where(raw_f0 > raw_M)]))
-        index[torch.where(raw_f0 > raw_M)] = n_bins - 1
-    if torch.any(raw_f0 < raw_m):
-        print("F0 Warning: too low f0: {}".format(raw_f0[torch.where(f0 < m)]))
-        index[torch.where(f0 < m)] = 0
-
-    return torch.as_tensor(index, dtype=torch.long, device=f0.device)
-
-
-def f0_to_coarse(f0, pitch_bin, pitch_min, pitch_max):
-    ## TODO: Figure out the detail of this function
-
-    f0_mel_min = 1127 * np.log(1 + pitch_min / 700)
-    f0_mel_max = 1127 * np.log(1 + pitch_max / 700)
+    f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+    f0_mel_max = 1127 * np.log(1 + f0_max / 700)

     is_torch = isinstance(f0, torch.Tensor)
     f0_mel = 1127 * (1 + f0 / 700).log() if is_torch else 1127 * np.log(1 + f0 / 700)
@@ -91,9 +70,6 @@ def get_log_f0(f0):
     return log_f0


-# ========== Methods ==========
-
-
 def get_f0_features_using_pyin(audio, cfg):
     """Using pyin to extract the f0 feature.
     Args:
@@ -148,14 +124,7 @@ def get_f0_features_using_parselmouth(audio, cfg, speed=1):
         )
         .selected_array["frequency"]
     )
-
-    # Pad the pitch to the mel_len
-    # pad_size = (int(len(audio) // hop_size) - len(f0) + 1) // 2
-    # f0 = np.pad(f0, [[pad_size, mel_len - len(f0) - pad_size]], mode="constant")
-
-    # Get the coarse part
-    pitch_coarse = f0_to_coarse(f0, cfg.pitch_bin, cfg.f0_min, cfg.f0_max)
-    return f0, pitch_coarse
+    return f0


 def get_f0_features_using_dio(audio, cfg):
@@ -260,14 +229,21 @@ def get_f0_features_using_crepe(
     return f0


-def get_f0(audio, cfg):
+def get_f0(audio, cfg, use_interpolate=False, return_uv=False):
     if cfg.pitch_extractor == "dio":
         f0 = get_f0_features_using_dio(audio, cfg)
     elif cfg.pitch_extractor == "pyin":
         f0 = get_f0_features_using_pyin(audio, cfg)
     elif cfg.pitch_extractor == "parselmouth":
-        f0, _ = get_f0_features_using_parselmouth(audio, cfg)
-    # elif cfg.data.f0_extractor == 'cwt': # todo
+        f0 = get_f0_features_using_parselmouth(audio, cfg)
+
+    if use_interpolate:
+        f0, uv = interpolate(f0)
+    else:
+        uv = f0 == 0
+
+    if return_uv:
+        return f0, uv

     return f0
diff --git a/utils/mel.py b/utils/mel.py
index bea72787..3894b73c 100644
--- a/utils/mel.py
+++ b/utils/mel.py
@@ -8,6 +8,7 @@


 def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
+    # Min value: ln(1e-5) = -11.5129
     return torch.log(torch.clamp(x, min=clip_val) * C)


@@ -52,6 +53,9 @@ def extract_linear_features(y, cfg, center=False):


 def mel_spectrogram_torch(y, cfg, center=False):
+    """
+    TODO: to merge this function with the extract_mel_features below
+    """
     if torch.min(y) < -1.0:
         print("min value is ", torch.min(y))
     if torch.max(y) > 1.0:
@@ -108,7 +112,6 @@ def extract_mel_features(
     y,
     cfg,
     center=False,
-    # n_fft, n_mel, sampling_rate, hop_size, win_size, fmin, fmax, center=False
 ):
     """Extract mel features

@@ -164,7 +167,6 @@ def extract_mel_features(
     spec = torch.matmul(mel_basis[str(cfg.fmax) + "_" + str(y.device)], spec)

     spec = spectral_normalize_torch(spec)
-
     return spec.squeeze(0)
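
The resolution transformation that ReTrans performs can be read in isolation from the patch. The sketch below is not part of the diff: the function name and the hop sizes (a 320-sample content hop mapped onto a 256-sample mel hop) are illustrative assumptions, but the repeat-then-mean-pool logic mirrors the hunk above.

# Standalone sketch of ReTrans: repeat each source frame source_hop times, then
# mean-pool every target_hop samples, so content features land on the mel frame grid.
# The hop sizes are illustrative, not taken from an Amphion config.
from math import gcd

import torch


def resolution_transform(source_feats, target_len, source_hop=320, target_hop=256):
    # source_feats: (B, source_len, D)
    B, source_len, D = source_feats.shape

    # Reduce both hops by their greatest common divisor, as init_for_retrans does.
    factor = gcd(source_hop, target_hop)
    source_hop, target_hop = source_hop // factor, target_hop // factor

    # Keep only the source frames needed to cover the target duration.
    valid_len = min(target_len * target_hop // source_hop + 1, source_len)
    const = valid_len * source_hop // target_hop * target_hop

    # (B, valid_len, D) -> (B, valid_len * source_hop, D) -> (B, const, D)
    up = torch.repeat_interleave(source_feats[:, :valid_len], source_hop, dim=1)[:, :const]
    # (B, const, D) -> (B, const // target_hop, target_hop, D) -> mean-pool -> (B, const // target_hop, D)
    down = up.reshape(B, -1, target_hop, D).mean(dim=2)

    # Pad with the last frame if the result is still shorter than target_len.
    if down.shape[1] < target_len:
        pad = down[:, -1:, :].repeat_interleave(target_len - down.shape[1], dim=1)
        down = torch.cat([down, pad], dim=1)
    return down[:, :target_len]


# 100 content frames at a 320-sample hop map onto 125 mel frames at a 256-sample hop.
feats = torch.randn(2, 100, 256)
assert resolution_transform(feats, target_len=125).shape == (2, 125, 256)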
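
The Hz-to-mel conversion in the new f0_to_coarse is fully visible in the hunk, but the actual bucketing step falls outside the diff context. The following is a hypothetical stand-in that matches the docstring's description (voiced frames quantized into [1, pitch_bin - 1]); Amphion's exact rounding, clipping, and handling of unvoiced frames may differ, and the bin count and f0 range used here are illustrative.

# Hz -> mel conversion as in the new f0_to_coarse, plus a hypothetical linear
# bucketing into [1, pitch_bin - 1]; unvoiced frames (f0 == 0) are left at 0 here.
import numpy as np


def f0_to_coarse_sketch(f0_hz, pitch_bin=256, f0_min=50.0, f0_max=1100.0):
    f0_mel = 1127 * np.log(1 + f0_hz / 700)
    mel_min = 1127 * np.log(1 + f0_min / 700)
    mel_max = 1127 * np.log(1 + f0_max / 700)

    # Map voiced frames linearly onto [1, pitch_bin - 1].
    coarse = np.zeros_like(f0_mel)
    voiced = f0_hz > 0
    coarse[voiced] = (f0_mel[voiced] - mel_min) / (mel_max - mel_min) * (pitch_bin - 2) + 1
    return np.rint(np.clip(coarse, 0, pitch_bin - 1)).astype(np.int64)


print(f0_to_coarse_sketch(np.array([0.0, 110.0, 440.0])))  # prints [  0  23 122]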
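
Putting the pieces together, the new AudioFeaturesExtractor is meant for per-batch, on-the-fly extraction during training. A hypothetical wrapper is sketched below; the helper name is not part of the patch, and `extractor` is assumed to be an AudioFeaturesExtractor built from a config whose preprocess section enables mel, F0, and energy extraction.

# Hypothetical per-batch usage of the new AudioFeaturesExtractor; only methods
# introduced in this patch are called.
def extract_batch_features(extractor, wavs, wav_lens):
    # wavs: (B, T) padded waveforms, wav_lens: (B,) valid lengths
    mel = extractor.get_mel_spectrogram(wavs)                            # (B, n_mels, n_frames)
    f0, uv = extractor.get_f0(wavs, wav_lens=wav_lens, return_uv=True)   # (B, n_frames) each
    energy = extractor.get_energy(wavs, mel_spec=mel)                    # (B, n_frames)
    # Content features are re-gridded to the mel frame count via ReTrans.
    content = extractor.get_whisper_features(wavs, target_frame_len=mel.shape[-1])
    return mel, f0, uv, energy, content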