From 5ceed25a859081e9f45939057021fae62c9ffb41 Mon Sep 17 00:00:00 2001
From: Liumeng Xue <33707885+lmxue@users.noreply.github.com>
Date: Thu, 14 Dec 2023 23:36:23 +0800
Subject: [PATCH] split and process valid set (#25)

Prepare the valid split for LibriTTS and LJSpeech, and thread the list of
dataset splits (train/valid/test) through preprocessing and metadata
generation instead of re-deriving it in every extractor.
---
 bins/tts/preprocess.py           | 61 +++++++++-------------
 preprocessors/libritts.py        | 38 ++++++++++++--
 preprocessors/ljspeech.py        | 29 +++++++++--
 preprocessors/metadata.py        | 89 ++++++++++++++++----------------
 processors/acoustic_extractor.py |  1 -
 5 files changed, 125 insertions(+), 93 deletions(-)

diff --git a/bins/tts/preprocess.py b/bins/tts/preprocess.py
index f2f4d2ed..bd8cd1ef 100644
--- a/bins/tts/preprocess.py
+++ b/bins/tts/preprocess.py
@@ -25,7 +25,7 @@
 )
 
 
-def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
+def extract_acoustic_features(dataset, output_path, cfg, dataset_types, n_workers=1):
     """Extract acoustic features of utterances in the dataset
 
     Args:
@@ -34,17 +34,9 @@ def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
         cfg (dict): dictionary that stores configurations
         n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
     """
-    # types = ["train", "test"] if "eval" not in dataset else ["test"]
-    types = list()
-    types.append((cfg.preprocess.train_file).split(".")[0])
-    types.append((cfg.preprocess.valid_file).split(".")[0])
-    if "test" not in types:
-        types.append("test")
-    if "eval" in dataset:
-        types = ["test"]
-    print("types: ", types)
+
     metadata = []
-    for dataset_type in types:
+    for dataset_type in dataset_types:
         dataset_output = os.path.join(output_path, dataset)
         dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
         with open(dataset_file, "r") as f:
@@ -58,7 +50,7 @@ def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
     )
 
 
-def extract_content_features(dataset, output_path, cfg, num_workers=1):
+def extract_content_features(dataset, output_path, cfg, dataset_types, num_workers=1):
     """Extract content features of utterances in the dataset
 
     Args:
@@ -66,18 +58,9 @@ def extract_content_features(dataset, output_path, cfg, num_workers=1):
         output_path (str): directory that stores train, test and feature files of datasets
         cfg (dict): dictionary that stores configurations
     """
-    # types = ["train", "test"] if "eval" not in dataset else ["test"]
-
-    types = list()
-    types.append((cfg.preprocess.train_file).split(".")[0])
-    types.append((cfg.preprocess.valid_file).split(".")[0])
-    if "test" not in types:
-        types.append("test")
-    if "eval" in dataset:
-        types = ["test"]
 
     metadata = []
-    for dataset_type in types:
+    for dataset_type in dataset_types:
         dataset_output = os.path.join(output_path, dataset)
         # dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
         dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
@@ -89,7 +72,7 @@ def extract_content_features(dataset, output_path, cfg, num_workers=1):
     )
 
 
-def extract_phonme_sequences(dataset, output_path, cfg):
+def extract_phonme_sequences(dataset, output_path, cfg, dataset_types):
     """Extract phoneme features of utterances in the dataset
 
     Args:
@@ -98,18 +81,9 @@ def extract_phonme_sequences(dataset, output_path, cfg):
         cfg (dict): dictionary that stores configurations
 
     """
-    # types = ["train", "test"] if "eval" not in dataset else ["test"]
-
-    types = list()
-    types.append((cfg.preprocess.train_file).split(".")[0])
-    types.append((cfg.preprocess.valid_file).split(".")[0])
-    if "test" not in types:
-        types.append("test")
-    if "eval" in dataset:
-        types = ["test"]
 
     metadata = []
-    for dataset_type in types:
+    for dataset_type in dataset_types:
         dataset_output = os.path.join(output_path, dataset)
         dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
         with open(dataset_file, "r") as f:
@@ -162,8 +136,17 @@ def preprocess(cfg, args):
     except:
         print("No Data Augmentation.")
 
+    # Derive the dataset splits (e.g. train/valid/test) from the configured json files
+    dataset_types = list()
+    dataset_types.append((cfg.preprocess.train_file).split(".")[0])
+    dataset_types.append((cfg.preprocess.valid_file).split(".")[0])
+    if "test" not in dataset_types:
+        dataset_types.append("test")
+    if "eval" in dataset:
+        dataset_types = ["test"]
+
     # Dump metadata of datasets (singers, train/test durations, etc.)
-    cal_metadata(cfg)
+    cal_metadata(cfg, dataset_types)
 
     # Prepare the acoustic features
     for dataset in cfg.dataset:
@@ -180,7 +163,9 @@ def preprocess(cfg, args):
                 dataset, args.num_workers
             )
         )
-        extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
+        extract_acoustic_features(
+            dataset, output_path, cfg, dataset_types, args.num_workers
+        )
         # Calculate the statistics of acoustic features
         if cfg.preprocess.mel_min_max_norm:
             acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
@@ -229,13 +214,15 @@ def preprocess(cfg, args):
     # Prepare the content features
     for dataset in cfg.dataset:
         print("Extracting content features for {}...".format(dataset))
-        extract_content_features(dataset, output_path, cfg, args.num_workers)
+        extract_content_features(
+            dataset, output_path, cfg, dataset_types, args.num_workers
+        )
 
     # Prepare the phoneme sequences
     if cfg.preprocess.extract_phone:
         for dataset in cfg.dataset:
             print("Extracting phoneme sequence for {}...".format(dataset))
-            extract_phonme_sequences(dataset, output_path, cfg)
+            extract_phonme_sequences(dataset, output_path, cfg, dataset_types)
 
 
 def main():
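The split derivation that used to be duplicated in each `extract_*` function is now computed once in `preprocess()` and passed down. A minimal, runnable sketch of that derivation; `SimpleNamespace` stands in for the project's real config object, everything else mirrors the hunk above:

```python
from types import SimpleNamespace

# Hypothetical cfg stub; the real one is loaded from the framework's JSON config.
cfg = SimpleNamespace(
    preprocess=SimpleNamespace(train_file="train.json", valid_file="valid.json")
)
dataset = "LJSpeech"

dataset_types = [
    cfg.preprocess.train_file.split(".")[0],
    cfg.preprocess.valid_file.split(".")[0],
]
if "test" not in dataset_types:
    dataset_types.append("test")
if "eval" in dataset:  # evaluation-only datasets keep just the test split
    dataset_types = ["test"]

print(dataset_types)  # ['train', 'valid', 'test']
```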
if "eval" in dataset: - types = ["test"] metadata = [] - for dataset_type in types: + for dataset_type in dataset_types: dataset_output = os.path.join(output_path, dataset) dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type)) with open(dataset_file, "r") as f: @@ -162,8 +136,17 @@ def preprocess(cfg, args): except: print("No Data Augmentation.") + # json files + dataset_types = list() + dataset_types.append((cfg.preprocess.train_file).split(".")[0]) + dataset_types.append((cfg.preprocess.valid_file).split(".")[0]) + if "test" not in dataset_types: + dataset_types.append("test") + if "eval" in dataset: + dataset_types = ["test"] + # Dump metadata of datasets (singers, train/test durations, etc.) - cal_metadata(cfg) + cal_metadata(cfg, dataset_types) # Prepare the acoustic features for dataset in cfg.dataset: @@ -180,7 +163,9 @@ def preprocess(cfg, args): dataset, args.num_workers ) ) - extract_acoustic_features(dataset, output_path, cfg, args.num_workers) + extract_acoustic_features( + dataset, output_path, cfg, dataset_types, args.num_workers + ) # Calculate the statistics of acoustic features if cfg.preprocess.mel_min_max_norm: acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg) @@ -229,13 +214,15 @@ def preprocess(cfg, args): # Prepare the content features for dataset in cfg.dataset: print("Extracting content features for {}...".format(dataset)) - extract_content_features(dataset, output_path, cfg, args.num_workers) + extract_content_features( + dataset, output_path, cfg, dataset_types, args.num_workers + ) # Prepare the phenome squences if cfg.preprocess.extract_phone: for dataset in cfg.dataset: print("Extracting phoneme sequence for {}...".format(dataset)) - extract_phonme_sequences(dataset, output_path, cfg) + extract_phonme_sequences(dataset, output_path, cfg, dataset_types) def main(): diff --git a/preprocessors/libritts.py b/preprocessors/libritts.py index 090a8112..adc86f0c 100644 --- a/preprocessors/libritts.py +++ b/preprocessors/libritts.py @@ -63,6 +63,7 @@ def main(output_path, dataset_path): os.makedirs(save_dir, exist_ok=True) train_output_file = os.path.join(save_dir, "train.json") test_output_file = os.path.join(save_dir, "test.json") + valid_output_file = os.path.join(save_dir, "valid.json") singer_dict_file = os.path.join(save_dir, "singers.json") utt2singer_file = os.path.join(save_dir, "utt2singer") if has_existed(train_output_file): @@ -79,12 +80,15 @@ def main(output_path, dataset_path): # We select pharases of standard spekaer as test songs train = [] test = [] + valid = [] train_index_count = 0 test_index_count = 0 + valid_index_count = 0 train_total_duration = 0 test_total_duration = 0 + valid_total_duration = 0 for distribution, speakers2pharases2utts in tqdm( distribution2speakers2pharases2utts.items() @@ -107,27 +111,49 @@ def main(output_path, dataset_path): res["Path"] = os.path.join(libritts_path, res["Path"]) assert os.path.exists(res["Path"]) + text_file_path = os.path.join( + libritts_path, + distribution, + speaker, + chosen_pharase, + chosen_uid + ".normalized.txt", + ) + with open(text_file_path, "r") as f: + lines = f.readlines() + assert len(lines) == 1 + text = lines[0].strip() + res["Text"] = text + waveform, sample_rate = torchaudio.load(res["Path"]) duration = waveform.size(-1) / sample_rate res["Duration"] = duration - if not "train" in distribution: + if "test" in distribution: res["index"] = test_index_count test_total_duration += duration test.append(res) test_index_count += 1 - else: + elif "train" in 
diff --git a/preprocessors/ljspeech.py b/preprocessors/ljspeech.py
index 9f4bd4ca..a7615a70 100644
--- a/preprocessors/ljspeech.py
+++ b/preprocessors/ljspeech.py
@@ -89,19 +89,26 @@ def get_uid2utt(ljspeech_path, dataset, cfg):
     return uid2utt, total_duration / 3600
 
 
-def split_dataset(lines, test_rate=0.05, test_size=None):
+def split_dataset(
+    lines, test_rate=0.05, valid_rate=0.05, test_size=None, valid_size=None
+):
     if test_size == None:
         test_size = int(len(lines) * test_rate)
+    if valid_size == None:
+        valid_size = int(len(lines) * valid_rate)
     random.shuffle(lines)
 
     train_set = []
     test_set = []
+    valid_set = []
 
     for line in lines[:test_size]:
         test_set.append(line)
-    for line in lines[test_size:]:
+    for line in lines[test_size : test_size + valid_size]:
+        valid_set.append(line)
+    for line in lines[test_size + valid_size :]:
         train_set.append(line)
-    return train_set, test_set
+    return train_set, test_set, valid_set
 
 
 max_wav_value = 32768.0
@@ -162,6 +169,7 @@ def main(output_path, dataset_path, cfg):
 
     train_output_file = os.path.join(save_dir, "train.json")
     test_output_file = os.path.join(save_dir, "test.json")
+    valid_output_file = os.path.join(save_dir, "valid.json")
     singer_dict_file = os.path.join(save_dir, "singers.json")
 
     speaker = "LJSpeech"
@@ -170,13 +178,17 @@ def main(output_path, dataset_path, cfg):
     with open(singer_dict_file, "w") as f:
         json.dump(singer_lut, f, indent=4, ensure_ascii=False)
 
-    if has_existed(train_output_file) and has_existed(test_output_file):
+    if (
+        has_existed(train_output_file)
+        and has_existed(test_output_file)
+        and has_existed(valid_output_file)
+    ):
         return
 
     meta_file = os.path.join(ljspeech_path, "metadata.csv")
     lines = get_lines(meta_file)
 
-    train_set, test_set = split_dataset(lines)
+    train_set, test_set, valid_set = split_dataset(lines)
 
     res, hours = get_uid2utt(ljspeech_path, train_set, cfg)
 
@@ -195,3 +207,11 @@
         json.dump(res, f, indent=4, ensure_ascii=False)
 
     print("Test_hours= {}".format(hours))
+
+    # Save valid
+    res, hours = get_uid2utt(ljspeech_path, valid_set, cfg)
+    os.makedirs(save_dir, exist_ok=True)
+    with open(valid_output_file, "w") as f:
+        json.dump(res, f, indent=4, ensure_ascii=False)
+
+    print("Valid_hours= {}".format(hours))
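A worked example of the three-way split arithmetic in `split_dataset`: LJSpeech ships 13,100 clips, so the default `test_rate = valid_rate = 0.05` yields 655 test and 655 valid utterances, leaving 11,790 for training. A toy run with integer stand-ins for the metadata.csv lines:

```python
import random

lines = list(range(13100))           # stand-ins for metadata.csv lines
test_size = int(len(lines) * 0.05)   # 655
valid_size = int(len(lines) * 0.05)  # 655
random.shuffle(lines)

# Same slicing as the patched function: test first, then valid, rest is train.
test_set = lines[:test_size]
valid_set = lines[test_size : test_size + valid_size]
train_set = lines[test_size + valid_size :]
print(len(train_set), len(test_set), len(valid_set))  # 11790 655 655
```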
diff --git a/preprocessors/metadata.py b/preprocessors/metadata.py
index ffa5c3e8..8411ea9a 100644
--- a/preprocessors/metadata.py
+++ b/preprocessors/metadata.py
@@ -8,7 +8,7 @@
 from tqdm import tqdm
 
 
-def cal_metadata(cfg):
+def cal_metadata(cfg, dataset_types=["train", "test"]):
     """
     Dump metadata (singers.json, meta_info.json, utt2singer) for singer dataset or multi-datasets.
     """
@@ -26,41 +26,42 @@ def cal_metadata(cfg):
         save_dir = os.path.join(cfg.preprocess.processed_dir, dataset)
         assert os.path.exists(save_dir)
 
-        # 'train.json' and 'test.json' of target dataset
-        train_metadata = os.path.join(save_dir, "train.json")
-        test_metadata = os.path.join(save_dir, "test.json")
-
-        # Sort the metadata as the duration order
-        with open(train_metadata, "r", encoding="utf-8") as f:
-            train_utterances = json.load(f)
-        with open(test_metadata, "r", encoding="utf-8") as f:
-            test_utterances = json.load(f)
-
-        train_utterances = sorted(train_utterances, key=lambda x: x["Duration"])
-        test_utterances = sorted(test_utterances, key=lambda x: x["Duration"])
-
-        # Write back the sorted metadata
-        with open(train_metadata, "w") as f:
-            json.dump(train_utterances, f, indent=4, ensure_ascii=False)
-        with open(test_metadata, "w") as f:
-            json.dump(test_utterances, f, indent=4, ensure_ascii=False)
+        # 'train.json', 'test.json', and 'valid.json' of the target dataset
+        meta_info = dict()
+        utterances_dict = dict()
+        all_utterances = list()
+        duration = dict()
+        total_duration = 0.0
+        for dataset_type in dataset_types:
+            metadata = os.path.join(save_dir, "{}.json".format(dataset_type))
+
+            # Sort the metadata in duration order
+            with open(metadata, "r", encoding="utf-8") as f:
+                utterances = json.load(f)
+            utterances = sorted(utterances, key=lambda x: x["Duration"])
+            utterances_dict[dataset_type] = utterances
+            all_utterances.extend(utterances)
+
+            # Write back the sorted metadata
+            with open(metadata, "w") as f:
+                json.dump(utterances, f, indent=4, ensure_ascii=False)
+
+            # Accumulate the total duration of each dataset split
+            duration[dataset_type] = sum(utt["Duration"] for utt in utterances)
+            total_duration += duration[dataset_type]
 
         # Paths of metadata needed to be generated
         singer_dict_file = os.path.join(save_dir, cfg.preprocess.spk2id)
         utt2singer_file = os.path.join(save_dir, cfg.preprocess.utt2spk)
 
-        # Get the total duration and singer names for train and test utterances
-        train_total_duration = sum(utt["Duration"] for utt in train_utterances)
-        test_total_duration = sum(utt["Duration"] for utt in test_utterances)
-
         singer_names = set(
             f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
-            for utt in train_utterances + test_utterances
+            for utt in all_utterances
         )
 
         # Write the utt2singer file and sort the singer names
         with open(utt2singer_file, "w", encoding="utf-8") as f:
-            for utt in train_utterances + test_utterances:
+            for utt in all_utterances:
                 f.write(
                     f"{utt['Dataset']}_{utt['Uid']}\t{replace_augment_name(utt['Dataset'])}_{utt['Singer']}\n"
                 )
@@ -75,30 +76,28 @@ def cal_metadata(cfg):
         meta_info = {
             "dataset": dataset,
             "statistics": {
-                "size": len(train_utterances) + len(test_utterances),
-                "hours": round(train_total_duration / 3600, 4)
-                + round(test_total_duration / 3600, 4),
+                "size": len(all_utterances),
+                "hours": round(total_duration / 3600, 4),
             },
-            "train": {
-                "size": len(train_utterances),
-                "hours": round(train_total_duration / 3600, 4),
-            },
-            "test": {
-                "size": len(test_utterances),
-                "hours": round(test_total_duration / 3600, 4),
-            },
-            "singers": {"size": len(singer_lut)},
         }
+
+        for dataset_type in dataset_types:
+            meta_info[dataset_type] = {
+                "size": len(utterances_dict[dataset_type]),
+                "hours": round(duration[dataset_type] / 3600, 4),
+            }
+
+        meta_info["singers"] = {"size": len(singer_lut)}
+
         # Use Counter to count the minutes for each singer
         total_singer2mins = Counter()
         training_singer2mins = Counter()
-        for utt in train_utterances:
-            k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
-            training_singer2mins[k] += utt["Duration"] / 60
-            total_singer2mins[k] += utt["Duration"] / 60
-        for utt in test_utterances:
-            k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
-            total_singer2mins[k] += utt["Duration"] / 60
+        for dataset_type in dataset_types:
+            for utt in utterances_dict[dataset_type]:
+                k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}"
+                if dataset_type == "train":
+                    training_singer2mins[k] += utt["Duration"] / 60
+                total_singer2mins[k] += utt["Duration"] / 60
 
         training_singer2mins = dict(
             sorted(training_singer2mins.items(), key=lambda x: x[1], reverse=True)
@@ -116,7 +115,7 @@ def cal_metadata(cfg):
             json.dump(meta_info, f, indent=4, ensure_ascii=False)
 
         for singer, min in training_singer2mins.items():
-            print(f"Singer {singer}: {min} mins for training")
+            print(f"Speaker/Singer {singer}: {min} mins for training")
 
         print("-" * 10, "\n")
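For orientation, the generalized `cal_metadata` now emits one per-split block for each entry in `dataset_types`, with "singers" appended after them. An illustrative shape of the resulting meta_info.json, reusing the LJSpeech sizes from the example above (the hours are made-up placeholders):

```python
# Illustrative only; real values come from summing the "Duration" fields.
meta_info = {
    "dataset": "LJSpeech",
    "statistics": {"size": 13100, "hours": 23.9},
    "train": {"size": 11790, "hours": 21.5},
    "valid": {"size": 655, "hours": 1.2},
    "test": {"size": 655, "hours": 1.2},
    "singers": {"size": 1},
}
```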
meta_info["singers"] = {"size": len(singer_lut)} + # Use Counter to count the minutes for each singer total_singer2mins = Counter() training_singer2mins = Counter() - for utt in train_utterances: - k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}" - training_singer2mins[k] += utt["Duration"] / 60 - total_singer2mins[k] += utt["Duration"] / 60 - for utt in test_utterances: - k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}" - total_singer2mins[k] += utt["Duration"] / 60 + for dataset_type in dataset_types: + for utt in utterances_dict[dataset_type]: + k = f"{replace_augment_name(utt['Dataset'])}_{utt['Singer']}" + if dataset_type == "train": + training_singer2mins[k] += utt["Duration"] / 60 + total_singer2mins[k] += utt["Duration"] / 60 training_singer2mins = dict( sorted(training_singer2mins.items(), key=lambda x: x[1], reverse=True) @@ -116,7 +115,7 @@ def cal_metadata(cfg): json.dump(meta_info, f, indent=4, ensure_ascii=False) for singer, min in training_singer2mins.items(): - print(f"Singer {singer}: {min} mins for training") + print(f"Speaker/Singer {singer}: {min} mins for training") print("-" * 10, "\n") diff --git a/processors/acoustic_extractor.py b/processors/acoustic_extractor.py index 629c2c20..2423be59 100644 --- a/processors/acoustic_extractor.py +++ b/processors/acoustic_extractor.py @@ -215,7 +215,6 @@ def __extract_utt_acoustic_features(dataset_output, cfg, utt): if cfg.preprocess.extract_acoustic_token: if cfg.preprocess.acoustic_token_extractor == "Encodec": codes = extract_encodec_token(wav_path) - save_feature( dataset_output, cfg.preprocess.acoustic_token_dir, uid, codes )