# preprocessors/svcceval.py

# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import glob
import librosa
import json

from utils.util import has_existed


def main(output_path, dataset_path):
    """Generate the svcceval metadata files under ``<output_path>/svcceval``.

    Scans ``<dataset_path>/Data/<singer>/*.wav`` and writes:

    * ``utt2singer``: one tab-separated ``<Uid> <Singer>`` line per utterance.
    * ``singers.json``: mapping from singer name to an integer id, assigned
      in sorted-name order.
    * ``train.json`` / ``test.json``: utterance lists sorted by ``Uid``, each
      entry carrying ``Dataset``, ``Singer``, ``Uid``, ``Path``, ``Duration``
      and a positional ``index``. Every utterance found is placed in the test
      split; the train split is always empty. A split file is skipped when
      ``has_existed(output_file)`` is truthy.

    Args:
        output_path: Directory under which the ``svcceval`` folder is created.
        dataset_path: Dataset root; wav files are looked up in its ``Data``
            sub-directory, one folder per singer.
    """
    print("-" * 10)
    print("Preparing training dataset for svcceval...")

    data_dir = os.path.join(dataset_path, "Data")
    save_dir = os.path.join(output_path, "svcceval")
    os.makedirs(save_dir, exist_ok=True)

    singer_dict_file = os.path.join(save_dir, "singers.json")
    utt2singer_file = os.path.join(save_dir, "utt2singer")

    # Load utterances
    train = []
    test = []
    singers = set()

    # glob returns matches in arbitrary order; sort so that utt2singer is
    # written deterministically across runs and file systems.
    wav_files = sorted(glob.glob(os.path.join(data_dir, "*", "*.wav")))

    # The context manager guarantees the mapping file is flushed and closed,
    # even if reading a wav file fails part-way through.
    with open(utt2singer_file, "w") as utt2singer:
        for wav_file in wav_files:
            # Layout is <data_dir>/<singer>/<uid>.wav; os.path handles the
            # platform's separator instead of assuming "/".
            singer = os.path.basename(os.path.dirname(wav_file))
            uid = os.path.basename(wav_file).split(".")[0]

            # NOTE(review): recent librosa releases deprecate `filename=` in
            # favour of `path=`; kept as-is for compatibility with the
            # version this project pins -- confirm before upgrading librosa.
            utt = {
                "Dataset": "svcceval",
                "Singer": singer,
                "Uid": "{}_{}".format(singer, uid),
                "Path": wav_file,
                "Duration": librosa.get_duration(filename=wav_file),
            }

            # All utterances go to the test split.
            test.append(utt)

            singers.add(singer)
            utt2singer.write("{}\t{}\n".format(utt["Uid"], utt["Singer"]))

    # Save singers.json
    singer_lut = {name: i for i, name in enumerate(sorted(singers))}
    with open(singer_dict_file, "w") as f:
        json.dump(singer_lut, f, indent=4, ensure_ascii=False)

    train_total_duration = sum(utt["Duration"] for utt in train)
    test_total_duration = sum(utt["Duration"] for utt in test)

    # Explicit name -> list mapping (instead of eval on the split name).
    splits = {"train": train, "test": test}
    for dataset_type, split_utterances in splits.items():
        output_file = os.path.join(save_dir, "{}.json".format(dataset_type))
        if has_existed(output_file):
            continue

        utterances = sorted(split_utterances, key=lambda x: x["Uid"])

        # Record each utterance's position within its sorted split.
        for i, utt in enumerate(utterances):
            utt["index"] = i

        print("{}: Total size: {}\n".format(dataset_type, len(utterances)))

        # Save
        with open(output_file, "w") as f:
            json.dump(utterances, f, indent=4, ensure_ascii=False)

    print(
        "#Train hours= {}, #Test hours= {}".format(
            train_total_duration / 3600, test_total_duration / 3600
        )
    )