From 5ceed25a859081e9f45939057021fae62c9ffb41 Mon Sep 17 00:00:00 2001
From: Liumeng Xue <33707885+lmxue@users.noreply.github.com>
Date: Thu, 14 Dec 2023 23:36:23 +0800
Subject: [PATCH] split and process valid set (#25)

Prepare the valid split for LibriTTS and LJSpeech, and thread the list of
dataset splits (train/valid/test) through preprocessing and metadata
generation instead of re-deriving it in every extractor.
---
 bins/tts/preprocess.py           | 61 +++++++++-------------
 preprocessors/libritts.py        | 38 ++++++++++++--
 preprocessors/ljspeech.py        | 29 +++++++++--
 preprocessors/metadata.py        | 89 ++++++++++++++++----------------
 processors/acoustic_extractor.py |  1 -
 5 files changed, 125 insertions(+), 93 deletions(-)

diff --git a/bins/tts/preprocess.py b/bins/tts/preprocess.py
index f2f4d2ed..bd8cd1ef 100644
--- a/bins/tts/preprocess.py
+++ b/bins/tts/preprocess.py
@@ -25,7 +25,7 @@
 )
 
 
-def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
+def extract_acoustic_features(dataset, output_path, cfg, dataset_types, n_workers=1):
     """Extract acoustic features of utterances in the dataset
 
     Args:
@@ -34,17 +34,9 @@ def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
         cfg (dict): dictionary that stores configurations
         n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
     """
-    # types = ["train", "test"] if "eval" not in dataset else ["test"]
-    types = list()
-    types.append((cfg.preprocess.train_file).split(".")[0])
-    types.append((cfg.preprocess.valid_file).split(".")[0])
-    if "test" not in types:
-        types.append("test")
-    if "eval" in dataset:
-        types = ["test"]
-    print("types: ", types)
+
     metadata = []
-    for dataset_type in types:
+    for dataset_type in dataset_types:
         dataset_output = os.path.join(output_path, dataset)
         dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
         with open(dataset_file, "r") as f:
@@ -58,7 +50,7 @@ def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
     )
 
 
-def extract_content_features(dataset, output_path, cfg, num_workers=1):
+def extract_content_features(dataset, output_path, cfg, dataset_types, num_workers=1):
     """Extract content features of utterances in the dataset
 
     Args:
@@ -66,18 +58,9 @@ def extract_content_features(dataset, output_path, cfg, num_workers=1):
         output_path (str): directory that stores train, test and feature files of datasets
         cfg (dict): dictionary that stores configurations
     """
-    # types = ["train", "test"] if "eval" not in dataset else ["test"]
-
-    types = list()
-    types.append((cfg.preprocess.train_file).split(".")[0])
-    types.append((cfg.preprocess.valid_file).split(".")[0])
-    if "test" not in types:
-        types.append("test")
-    if "eval" in dataset:
-        types = ["test"]
 
     metadata = []
-    for dataset_type in types:
+    for dataset_type in dataset_types:
         dataset_output = os.path.join(output_path, dataset)
         # dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
         dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
@@ -89,7 +72,7 @@ def extract_content_features(dataset, output_path, cfg, num_workers=1):
     )
 
 
-def extract_phonme_sequences(dataset, output_path, cfg):
+def extract_phonme_sequences(dataset, output_path, cfg, dataset_types):
     """Extract phoneme features of utterances in the dataset
 
     Args:
@@ -98,18 +81,9 @@ def extract_phonme_sequences(dataset, output_path, cfg):
         cfg (dict): dictionary that stores configurations
 
     """
-    # types = ["train", "test"] if "eval" not in dataset else ["test"]
-
-    types = list()
-    types.append((cfg.preprocess.train_file).split(".")[0])
-    types.append((cfg.preprocess.valid_file).split(".")[0])
-    if "test" not in types:
-        types.append("test")
-    if "eval" in dataset:
-        types = ["test"]
 
     metadata = []
-    for dataset_type in types:
+    for dataset_type in dataset_types:
         dataset_output = os.path.join(output_path, dataset)
         dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
         with open(dataset_file, "r") as f:
@@ -162,8 +136,17 @@ def preprocess(cfg, args):
     except:
         print("No Data Augmentation.")
 
+    # Derive the dataset splits (e.g. train/valid/test) from the configured json files
+    dataset_types = list()
+    dataset_types.append((cfg.preprocess.train_file).split(".")[0])
+    dataset_types.append((cfg.preprocess.valid_file).split(".")[0])
+    if "test" not in dataset_types:
+        dataset_types.append("test")
+    if "eval" in dataset:
+        dataset_types = ["test"]
+
     # Dump metadata of datasets (singers, train/test durations, etc.)
-    cal_metadata(cfg)
+    cal_metadata(cfg, dataset_types)
 
     # Prepare the acoustic features
     for dataset in cfg.dataset:
@@ -180,7 +163,9 @@ def preprocess(cfg, args):
                 dataset, args.num_workers
             )
         )
-        extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
+        extract_acoustic_features(
+            dataset, output_path, cfg, dataset_types, args.num_workers
+        )
         # Calculate the statistics of acoustic features
         if cfg.preprocess.mel_min_max_norm:
             acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
@@ -229,13 +214,15 @@ def preprocess(cfg, args):
     # Prepare the content features
     for dataset in cfg.dataset:
         print("Extracting content features for {}...".format(dataset))
-        extract_content_features(dataset, output_path, cfg, args.num_workers)
+        extract_content_features(
+            dataset, output_path, cfg, dataset_types, args.num_workers
+        )
 
     # Prepare the phoneme sequences
     if cfg.preprocess.extract_phone:
         for dataset in cfg.dataset:
             print("Extracting phoneme sequence for {}...".format(dataset))
-            extract_phonme_sequences(dataset, output_path, cfg)
+            extract_phonme_sequences(dataset, output_path, cfg, dataset_types)
 
 
 def main():
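The split derivation that used to be duplicated in each `extract_*` function is now computed once in `preprocess()` and passed down. A minimal, runnable sketch of that derivation; `SimpleNamespace` stands in for the project's real config object, everything else mirrors the hunk above:

```python
from types import SimpleNamespace

# Hypothetical cfg stub; the real one is loaded from the framework's JSON config.
cfg = SimpleNamespace(
    preprocess=SimpleNamespace(train_file="train.json", valid_file="valid.json")
)
dataset = "LJSpeech"

dataset_types = [
    cfg.preprocess.train_file.split(".")[0],
    cfg.preprocess.valid_file.split(".")[0],
]
if "test" not in dataset_types:
    dataset_types.append("test")
if "eval" in dataset:  # evaluation-only datasets keep just the test split
    dataset_types = ["test"]

print(dataset_types)  # ['train', 'valid', 'test']
```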
if "eval" in dataset: - types = ["test"] metadata = [] - for dataset_type in types: + for dataset_type in dataset_types: dataset_output = os.path.join(output_path, dataset) dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) with open(dataset_file, "r") as f: @@ -162,8 +136,17 @@ def preprocess(cfg, args): except: print("No Data Augmentation.") + # json files + dataset_types = list() + dataset_types.append((cfg.preprocess.train_file).split(".")[0]) + dataset_types.append((cfg.preprocess.valid_file).split(".")[0]) + if "test" not in dataset_types: + dataset_types.append("test") + if "eval" in dataset: + dataset_types = ["test"] + # Dump metadata of datasets (singers, train/test durations, etc.) - cal_metadata(cfg) + cal_metadata(cfg, dataset_types) # Prepare the acoustic features for dataset in cfg.dataset: @@ -180,7 +163,9 @@ def preprocess(cfg, args): dataset, args.num_workers ) ) - extract_acoustic_features(dataset, output_path, cfg, args.num_workers) + extract_acoustic_features( + dataset, output_path, cfg, dataset_types, args.num_workers + ) # Calculate the statistics of acoustic features if cfg.preprocess.mel_min_max_norm: acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg) @@ -229,13 +214,15 @@ def preprocess(cfg, args): # Prepare the content features for dataset in cfg.dataset: print("Extracting content features for {}...".format(dataset)) - extract_content_features(dataset, output_path, cfg, args.num_workers) + extract_content_features( + dataset, output_path, cfg, dataset_types, args.num_workers + ) # Prepare the phenome squences if cfg.preprocess.extract_phone: for dataset in cfg.dataset: print("Extracting phoneme sequence for {}...".format(dataset)) - extract_phonme_sequences(dataset, output_path, cfg) + extract_phonme_sequences(dataset, output_path, cfg, dataset_types) def main(): diff --git a/preprocessors/libritts.py b/preprocessors/libritts.py index 090a8112..adc86f0c 100644 --- a/preprocessors/libritts.py +++ b/preprocessors/libritts.py @@ -63,6 +63,7 @@ def main(output_path, dataset_path): os.makedirs(save_dir, exist_ok=True) train_output_file = os.path.join(save_dir, "train.json") test_output_file = os.path.join(save_dir, "test.json") + valid_output_file = os.path.join(save_dir, "valid.json") singer_dict_file = os.path.join(save_dir, "singers.json") utt2singer_file = os.path.join(save_dir, "utt2singer") if has_existed(train_output_file): @@ -79,12 +80,15 @@ def main(output_path, dataset_path): # We select pharases of standard spekaer as test songs train = [] test = [] + valid = [] train_index_count = 0 test_index_count = 0 + valid_index_count = 0 train_total_duration = 0 test_total_duration = 0 + valid_total_duration = 0 for distribution, speakers2pharases2utts in tqdm( distribution2speakers2pharases2utts.items() @@ -107,27 +111,49 @@ def main(output_path, dataset_path): res["Path"] = os.path.join(libritts_path, res["Path"]) assert os.path.exists(res["Path"]) + text_file_path = os.path.join( + libritts_path, + distribution, + speaker, + chosen_pharase, + chosen_uid + ".normalized.txt", + ) + with open(text_file_path, "r") as f: + lines = f.readlines() + assert len(lines) == 1 + text = lines[0].strip() + res["Text"] = text + waveform, sample_rate = torchaudio.load(res["Path"]) duration = waveform.size(-1) / sample_rate res["Duration"] = duration - if not "train" in distribution: + if "test" in distribution: res["index"] = test_index_count test_total_duration += duration test.append(res) test_index_count += 1 - else: + elif "train" in 
diff --git a/preprocessors/ljspeech.py b/preprocessors/ljspeech.py
index 9f4bd4ca..a7615a70 100644
--- a/preprocessors/ljspeech.py
+++ b/preprocessors/ljspeech.py
@@ -89,19 +89,26 @@ def get_uid2utt(ljspeech_path, dataset, cfg):
     return uid2utt, total_duration / 3600
 
 
-def split_dataset(lines, test_rate=0.05, test_size=None):
+def split_dataset(
+    lines, test_rate=0.05, valid_rate=0.05, test_size=None, valid_size=None
+):
     if test_size == None:
         test_size = int(len(lines) * test_rate)
+    if valid_size == None:
+        valid_size = int(len(lines) * valid_rate)
     random.shuffle(lines)
 
     train_set = []
     test_set = []
+    valid_set = []
 
     for line in lines[:test_size]:
         test_set.append(line)
-    for line in lines[test_size:]:
+    for line in lines[test_size : test_size + valid_size]:
+        valid_set.append(line)
+    for line in lines[test_size + valid_size :]:
         train_set.append(line)
-    return train_set, test_set
+    return train_set, test_set, valid_set
 
 
 max_wav_value = 32768.0
@@ -162,6 +169,7 @@ def main(output_path, dataset_path, cfg):
 
     train_output_file = os.path.join(save_dir, "train.json")
     test_output_file = os.path.join(save_dir, "test.json")
+    valid_output_file = os.path.join(save_dir, "valid.json")
     singer_dict_file = os.path.join(save_dir, "singers.json")
 
     speaker = "LJSpeech"
@@ -170,13 +178,17 @@ def main(output_path, dataset_path, cfg):
     with open(singer_dict_file, "w") as f:
         json.dump(singer_lut, f, indent=4, ensure_ascii=False)
 
-    if has_existed(train_output_file) and has_existed(test_output_file):
+    if (
+        has_existed(train_output_file)
+        and has_existed(test_output_file)
+        and has_existed(valid_output_file)
+    ):
         return
 
     meta_file = os.path.join(ljspeech_path, "metadata.csv")
     lines = get_lines(meta_file)
 
-    train_set, test_set = split_dataset(lines)
+    train_set, test_set, valid_set = split_dataset(lines)
 
     res, hours = get_uid2utt(ljspeech_path, train_set, cfg)
 
@@ -195,3 +207,11 @@
         json.dump(res, f, indent=4, ensure_ascii=False)
 
     print("Test_hours= {}".format(hours))
+
+    # Save valid
+    res, hours = get_uid2utt(ljspeech_path, valid_set, cfg)
+    os.makedirs(save_dir, exist_ok=True)
+    with open(valid_output_file, "w") as f:
+        json.dump(res, f, indent=4, ensure_ascii=False)
+
+    print("Valid_hours= {}".format(hours))
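A worked example of the three-way split arithmetic in `split_dataset`: LJSpeech ships 13,100 clips, so the default `test_rate = valid_rate = 0.05` yields 655 test and 655 valid utterances, leaving 11,790 for training. A toy run with integer stand-ins for the metadata.csv lines:

```python
import random

lines = list(range(13100))           # stand-ins for metadata.csv lines
test_size = int(len(lines) * 0.05)   # 655
valid_size = int(len(lines) * 0.05)  # 655
random.shuffle(lines)

# Same slicing as the patched function: test first, then valid, rest is train.
test_set = lines[:test_size]
valid_set = lines[test_size : test_size + valid_size]
train_set = lines[test_size + valid_size :]
print(len(train_set), len(test_set), len(valid_set))  # 11790 655 655
```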
diff --git a/preprocessors/metadata.py b/preprocessors/metadata.py
index ffa5c3e8..8411ea9a 100644
--- a/preprocessors/metadata.py
+++ b/preprocessors/metadata.py
@@ -8,7 +8,7 @@
 from tqdm import tqdm
 
 
-def cal_metadata(cfg):
+def cal_metadata(cfg, dataset_types=["train", "test"]):
     """
     Dump metadata (singers.json, meta_info.json, utt2singer) for singer dataset or multi-datasets.
     """
@@ -26,41 +26,42 @@ def cal_metadata(cfg):
         save_dir = os.path.join(cfg.preprocess.processed_dir, dataset)
         assert os.path.exists(save_dir)
 
-        # 'train.json' and 'test.json' of target dataset
-        train_metadata = os.path.join(save_dir, "train.json")
-        test_metadata = os.path.join(save_dir, "test.json")
-
-        # Sort the metadata as the duration order
-        with open(train_metadata, "r", encoding="utf-8") as f:
-            train_utterances = json.load(f)
-        with open(test_metadata, "r", encoding="utf-8") as f:
-            test_utterances = json.load(f)
-
-        train_utterances = sorted(train_utterances, key=lambda x: x["Duration"])
-        test_utterances = sorted(test_utterances, key=lambda x: x["Duration"])
-
-        # Write back the sorted metadata
-        with open(train_metadata, "w") as f:
-            json.dump(train_utterances, f, indent=4, ensure_ascii=False)
-        with open(test_metadata, "w") as f:
-            json.dump(test_utterances, f, indent=4, ensure_ascii=False)
+        # 'train.json', 'test.json', and 'valid.json' of the target dataset
+        meta_info = dict()
+        utterances_dict = dict()
+        all_utterances = list()
+        duration = dict()
+        total_duration = 0.0
+        for dataset_type in dataset_types:
+            metadata = os.path.join(save_dir, "{}.json".format(dataset_type))
+
+            # Sort the metadata in duration order
+            with open(metadata, "r", encoding="utf-8") as f:
+                utterances = json.load(f)
+            utterances = sorted(utterances, key=lambda x: x["Duration"])
+            utterances_dict[dataset_type] = utterances
+            all_utterances.extend(utterances)
+
+            # Write back the sorted metadata
+            with open(metadata, "w") as f:
+                json.dump(utterances, f, indent=4, ensure_ascii=False)
+
+            # Accumulate the total duration of each dataset split
+            duration[dataset_type] = sum(utt["Duration"] for utt in utterances)
+            total_duration += duration[dataset_type]
 
         # Paths of metadata needed to be generated
         singer_dict_file = os.path.join(save_dir, cfg.preprocess.spk2id)
         utt2singer_file = os.path.join(save_dir, cfg.preprocess.utt2spk)
 
-        # Get the total duration and singer names for train and test utterances
-        train_total_duration = sum(utt["Duration"] for utt in train_utterances)
-        test_total_duration = sum(utt["Duration"] for utt in test_utterances)
-
         singer_names = set(
             f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
-            for utt in train_utterances + test_utterances
+            for utt in all_utterances
         )
 
         # Write the utt2singer file and sort the singer names
         with open(utt2singer_file, "w", encoding="utf-8") as f:
-            for utt in train_utterances + test_utterances:
+            for utt in all_utterances:
                 f.write(
                     f"{utt['Dataset']}_{utt['Uid']}\t{replace_augment_name(utt['Dataset'])}_{utt['Singer']}\n"
                 )
@@ -75,30 +76,28 @@ def cal_metadata(cfg):
         meta_info = {
             "dataset": dataset,
             "statistics": {
-                "size": len(train_utterances) + len(test_utterances),
-                "hours": round(train_total_duration / 3600, 4)
-                + round(test_total_duration / 3600, 4),
+                "size": len(all_utterances),
+                "hours": round(total_duration / 3600, 4),
             },
-            "train": {
-                "size": len(train_utterances),
-                "hours": round(train_total_duration / 3600, 4),
-            },
-            "test": {
-                "size": len(test_utterances),
-                "hours": round(test_total_duration / 3600, 4),
-            },
-            "singers": {"size": len(singer_lut)},
         }
+
+        for dataset_type in dataset_types:
+            meta_info[dataset_type] = {
+                "size": len(utterances_dict[dataset_type]),
+                "hours": round(duration[dataset_type] / 3600, 4),
+            }
+
+        meta_info["singers"] = {"size": len(singer_lut)}
+
         # Use Counter to count the minutes for each singer
         total_singer2mins = Counter()
         training_singer2mins = Counter()
-        for utt in train_utterances:
-            k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
-            training_singer2mins[k] += utt["Duration"] / 60
-            total_singer2mins[k] += utt["Duration"] / 60
-        for utt in test_utterances:
-            k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
-            total_singer2mins[k] += utt["Duration"] / 60
+        for dataset_type in dataset_types:
+            for utt in utterances_dict[dataset_type]:
+                k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
+                if dataset_type == "train":
+                    training_singer2mins[k] += utt["Duration"] / 60
+                total_singer2mins[k] += utt["Duration"] / 60
 
         training_singer2mins = dict(
             sorted(training_singer2mins.items(), key=lambda x: x[1], reverse=True)
@@ -116,7 +115,7 @@ def cal_metadata(cfg):
             json.dump(meta_info, f, indent=4, ensure_ascii=False)
 
         for singer, min in training_singer2mins.items():
-            print(f"Singer {singer}: {min} mins for training")
+            print(f"Speaker/Singer {singer}: {min} mins for training")
 
         print("-" * 10, "\n")
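For orientation, the generalized `cal_metadata` now emits one per-split block for each entry in `dataset_types`, with "singers" appended after them. An illustrative shape of the resulting meta_info.json, reusing the LJSpeech sizes from the example above (the hours are made-up placeholders):

```python
# Illustrative only; real values come from summing the "Duration" fields.
meta_info = {
    "dataset": "LJSpeech",
    "statistics": {"size": 13100, "hours": 23.9},
    "train": {"size": 11790, "hours": 21.5},
    "valid": {"size": 655, "hours": 1.2},
    "test": {"size": 655, "hours": 1.2},
    "singers": {"size": 1},
}
```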
meta_info["singers"] = {"size": len(singer_lut)} + # Use Counter to count the minutes for each singer total_singer2mins = Counter() training_singer2mins = Counter() - for utt in train_utterances: - k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}" - training_singer2mins[k] += utt["Duration"] / 60 - total_singer2mins[k] += utt["Duration"] / 60 - for utt in test_utterances: - k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}" - total_singer2mins[k] += utt["Duration"] / 60 + for dataset_type in dataset_types: + for utt in utterances_dict[dataset_type]: + k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}" + if dataset_type == "train": + training_singer2mins[k] += utt["Duration"] / 60 + total_singer2mins[k] += utt["Duration"] / 60 training_singer2mins = dict( sorted(training_singer2mins.items(), key=lambda x: x[1], reverse=True) @@ -116,7 +115,7 @@ def cal_metadata(cfg): json.dump(meta_info, f, indent=4, ensure_ascii=False) for singer, min in training_singer2mins.items(): - print(f"Singer {singer}: {min} mins for training") + print(f"Speaker/Singer {singer}: {min} mins for training") print("-" * 10, "\n") diff --git a/processors/acoustic_extractor.py b/processors/acoustic_extractor.py index 629c2c20..2423be59 100644 --- a/processors/acoustic_extractor.py +++ b/processors/acoustic_extractor.py @@ -215,7 +215,6 @@ def __extract_utt_acoustic_features(dataset_output, cfg, utt): if cfg.preprocess.extract_acoustic_token: if cfg.preprocess.acoustic_token_extractor == "Encodec": codes = extract_encodec_token(wav_path) - save_feature( dataset_output, cfg.preprocess.acoustic_token_dir, uid, codes )