
Commit

split and process valid set (#25)
Prepare Valid set for LibriTTS and LJSpeech
lmxue committed Dec 14, 2023
1 parent a6e147c commit 5ceed25
Showing 5 changed files with 125 additions and 93 deletions.
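Note on the change: the commit replaces three copies of the split-selection logic (one per extractor in bins/tts/preprocess.py) with a single dataset_types list that is built once in preprocess() and passed down, and it adds a valid split to the LibriTTS and LJSpeech preprocessors. A minimal sketch of the centralized selection logic, assuming a typical config whose train_file/valid_file are "train.json"/"valid.json" (the cfg values below are illustrative, not from the repo config):

    from types import SimpleNamespace

    # Hypothetical config values; the real ones come from the experiment config.
    cfg = SimpleNamespace(
        preprocess=SimpleNamespace(train_file="train.json", valid_file="valid.json")
    )
    dataset = "LibriTTS"

    dataset_types = [
        cfg.preprocess.train_file.split(".")[0],  # "train"
        cfg.preprocess.valid_file.split(".")[0],  # "valid"
    ]
    if "test" not in dataset_types:
        dataset_types.append("test")
    if "eval" in dataset:  # evaluation-only datasets keep just the test split
        dataset_types = ["test"]

    print(dataset_types)  # ['train', 'valid', 'test']
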
61 changes: 24 additions & 37 deletions bins/tts/preprocess.py
@@ -25,7 +25,7 @@
)


def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
def extract_acoustic_features(dataset, output_path, cfg, dataset_types, n_workers=1):
"""Extract acoustic features of utterances in the dataset
Args:
@@ -34,17 +34,9 @@ def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
cfg (dict): dictionary that stores configurations
n_workers (int, optional): num of processes to extract features in parallel. Defaults to 1.
"""
# types = ["train", "test"] if "eval" not in dataset else ["test"]
types = list()
types.append((cfg.preprocess.train_file).split(".")[0])
types.append((cfg.preprocess.valid_file).split(".")[0])
if "test" not in types:
types.append("test")
if "eval" in dataset:
types = ["test"]
print("types: ", types)

metadata = []
for dataset_type in types:
for dataset_type in dataset_types:
dataset_output = os.path.join(output_path, dataset)
dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
with open(dataset_file, "r") as f:
@@ -58,26 +50,17 @@ def extract_acoustic_features(dataset, output_path, cfg, n_workers=1):
)


def extract_content_features(dataset, output_path, cfg, num_workers=1):
def extract_content_features(dataset, output_path, cfg, dataset_types, num_workers=1):
"""Extract content features of utterances in the dataset
Args:
dataset (str): name of dataset, e.g. opencpop
output_path (str): directory that stores train, test and feature files of datasets
cfg (dict): dictionary that stores configurations
"""
# types = ["train", "test"] if "eval" not in dataset else ["test"]

types = list()
types.append((cfg.preprocess.train_file).split(".")[0])
types.append((cfg.preprocess.valid_file).split(".")[0])
if "test" not in types:
types.append("test")
if "eval" in dataset:
types = ["test"]

metadata = []
for dataset_type in types:
for dataset_type in dataset_types:
dataset_output = os.path.join(output_path, dataset)
# dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
@@ -89,7 +72,7 @@ def extract_content_features(dataset, output_path, cfg, num_workers=1):
)


def extract_phonme_sequences(dataset, output_path, cfg):
def extract_phonme_sequences(dataset, output_path, cfg, dataset_types):
"""Extract phoneme features of utterances in the dataset
Args:
@@ -98,18 +81,9 @@ def extract_phonme_sequences(dataset, output_path, cfg):
cfg (dict): dictionary that stores configurations
"""
# types = ["train", "test"] if "eval" not in dataset else ["test"]

types = list()
types.append((cfg.preprocess.train_file).split(".")[0])
types.append((cfg.preprocess.valid_file).split(".")[0])
if "test" not in types:
types.append("test")
if "eval" in dataset:
types = ["test"]

metadata = []
for dataset_type in types:
for dataset_type in dataset_types:
dataset_output = os.path.join(output_path, dataset)
dataset_file = os.path.join(dataset_output, "{}.json".format(dataset_type))
with open(dataset_file, "r") as f:
@@ -162,8 +136,17 @@ def preprocess(cfg, args):
except:
print("No Data Augmentation.")

# json files
dataset_types = list()
dataset_types.append((cfg.preprocess.train_file).split(".")[0])
dataset_types.append((cfg.preprocess.valid_file).split(".")[0])
if "test" not in dataset_types:
dataset_types.append("test")
if "eval" in dataset:
dataset_types = ["test"]

# Dump metadata of datasets (singers, train/test durations, etc.)
cal_metadata(cfg)
cal_metadata(cfg, dataset_types)

# Prepare the acoustic features
for dataset in cfg.dataset:
@@ -180,7 +163,9 @@ def preprocess(cfg, args):
dataset, args.num_workers
)
)
extract_acoustic_features(dataset, output_path, cfg, args.num_workers)
extract_acoustic_features(
dataset, output_path, cfg, dataset_types, args.num_workers
)
# Calculate the statistics of acoustic features
if cfg.preprocess.mel_min_max_norm:
acoustic_extractor.cal_mel_min_max(dataset, output_path, cfg)
@@ -229,13 +214,15 @@ def preprocess(cfg, args):
# Prepare the content features
for dataset in cfg.dataset:
print("Extracting content features for {}...".format(dataset))
extract_content_features(dataset, output_path, cfg, args.num_workers)
extract_content_features(
dataset, output_path, cfg, dataset_types, args.num_workers
)

# Prepare the phenome squences
if cfg.preprocess.extract_phone:
for dataset in cfg.dataset:
print("Extracting phoneme sequence for {}...".format(dataset))
extract_phonme_sequences(dataset, output_path, cfg)
extract_phonme_sequences(dataset, output_path, cfg, dataset_types)


def main():
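With the signatures above, each extractor simply walks the splits it is handed instead of rebuilding the list itself. A rough sketch of that loop, assuming the per-split metadata layout used throughout the file (<output_path>/<dataset>/<split>.json holding a list of utterance entries); load_metadata is an illustrative helper, not part of the commit:

    import json
    import os

    def load_metadata(output_path, dataset, dataset_types):
        # Collect utterance entries from every requested split, e.g.
        # <output_path>/LibriTTS/train.json, valid.json, test.json.
        metadata = []
        for dataset_type in dataset_types:
            dataset_file = os.path.join(output_path, dataset, "{}.json".format(dataset_type))
            with open(dataset_file, "r") as f:
                metadata.extend(json.load(f))
        return metadata
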
38 changes: 33 additions & 5 deletions preprocessors/libritts.py
@@ -63,6 +63,7 @@ def main(output_path, dataset_path):
os.makedirs(save_dir, exist_ok=True)
train_output_file = os.path.join(save_dir, "train.json")
test_output_file = os.path.join(save_dir, "test.json")
valid_output_file = os.path.join(save_dir, "valid.json")
singer_dict_file = os.path.join(save_dir, "singers.json")
utt2singer_file = os.path.join(save_dir, "utt2singer")
if has_existed(train_output_file):
@@ -79,12 +80,15 @@
# We select pharases of standard spekaer as test songs
train = []
test = []
valid = []

train_index_count = 0
test_index_count = 0
valid_index_count = 0

train_total_duration = 0
test_total_duration = 0
valid_total_duration = 0

for distribution, speakers2pharases2utts in tqdm(
distribution2speakers2pharases2utts.items()
@@ -107,27 +111,49 @@
res["Path"] = os.path.join(libritts_path, res["Path"])
assert os.path.exists(res["Path"])

text_file_path = os.path.join(
libritts_path,
distribution,
speaker,
chosen_pharase,
chosen_uid + ".normalized.txt",
)
with open(text_file_path, "r") as f:
lines = f.readlines()
assert len(lines) == 1
text = lines[0].strip()
res["Text"] = text

waveform, sample_rate = torchaudio.load(res["Path"])
duration = waveform.size(-1) / sample_rate
res["Duration"] = duration

if not "train" in distribution:
if "test" in distribution:
res["index"] = test_index_count
test_total_duration += duration
test.append(res)
test_index_count += 1
else:
elif "train" in distribution:
res["index"] = train_index_count
train_total_duration += duration
train.append(res)
train_index_count += 1
elif "dev" in distribution:
res["index"] = valid_index_count
valid_total_duration += duration
valid.append(res)
valid_index_count += 1

utt2singer.write("{}\t{}\n".format(res["Uid"], res["Singer"]))

print("#Train = {}, #Test = {}".format(len(train), len(test)))
print(
"#Train hours= {}, #Test hours= {}".format(
train_total_duration / 3600, test_total_duration / 3600
"#Train = {}, #Test = {}, #Valid = {}".format(len(train), len(test), len(valid))
)
print(
"#Train hours= {}, #Test hours= {}, #Valid hours= {}".format(
train_total_duration / 3600,
test_total_duration / 3600,
valid_total_duration / 3600,
)
)

@@ -136,6 +162,8 @@
json.dump(train, f, indent=4, ensure_ascii=False)
with open(test_output_file, "w") as f:
json.dump(test, f, indent=4, ensure_ascii=False)
with open(valid_output_file, "w") as f:
json.dump(valid, f, indent=4, ensure_ascii=False)

# Save singers.json
singer_lut = {name: i for i, name in enumerate(unique_speakers)}
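For reference, the routing added above keys off the LibriTTS subset name in the order test, train, dev; with this commit, the dev-* subsets feed the new valid split. A compact sketch of the same rule (split_for is an illustrative helper, not code from the repo):

    def split_for(distribution):
        # Map a LibriTTS subset name to the split its utterances land in.
        if "test" in distribution:       # test-clean, test-other
            return "test"
        if "train" in distribution:      # train-clean-100, train-clean-360, train-other-500
            return "train"
        if "dev" in distribution:        # dev-clean, dev-other
            return "valid"
        raise ValueError("unknown LibriTTS subset: {}".format(distribution))
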
29 changes: 24 additions & 5 deletions preprocessors/ljspeech.py
@@ -89,19 +89,26 @@ def get_uid2utt(ljspeech_path, dataset, cfg):
return uid2utt, total_duration / 3600


def split_dataset(lines, test_rate=0.05, test_size=None):
def split_dataset(
lines, test_rate=0.05, valid_rate=0.05, test_size=None, valid_size=None
):
if test_size == None:
test_size = int(len(lines) * test_rate)
if valid_size == None:
valid_size = int(len(lines) * valid_rate)
random.shuffle(lines)

train_set = []
test_set = []
valid_set = []

for line in lines[:test_size]:
test_set.append(line)
for line in lines[test_size:]:
for line in lines[test_size : test_size + valid_size]:
valid_set.append(line)
for line in lines[test_size + valid_size :]:
train_set.append(line)
return train_set, test_set
return train_set, test_set, valid_set


max_wav_value = 32768.0
@@ -162,6 +169,7 @@ def main(output_path, dataset_path, cfg):

train_output_file = os.path.join(save_dir, "train.json")
test_output_file = os.path.join(save_dir, "test.json")
valid_output_file = os.path.join(save_dir, "valid.json")
singer_dict_file = os.path.join(save_dir, "singers.json")

speaker = "LJSpeech"
@@ -170,13 +178,17 @@
with open(singer_dict_file, "w") as f:
json.dump(singer_lut, f, indent=4, ensure_ascii=False)

if has_existed(train_output_file) and has_existed(test_output_file):
if (
has_existed(train_output_file)
and has_existed(test_output_file)
and has_existed(valid_output_file)
):
return

meta_file = os.path.join(ljspeech_path, "metadata.csv")
lines = get_lines(meta_file)

train_set, test_set = split_dataset(lines)
train_set, test_set, valid_set = split_dataset(lines)

res, hours = get_uid2utt(ljspeech_path, train_set, cfg)

@@ -195,3 +207,10 @@
json.dump(res, f, indent=4, ensure_ascii=False)

print("Test_hours= {}".format(hours))

# Save valid
os.makedirs(save_dir, exist_ok=True)
with open(valid_output_file, "w") as f:
json.dump(res, f, indent=4, ensure_ascii=False)

print("Valid_hours= {}".format(hours))
