Issue80 #8

Merged
merged 9 commits, May 12, 2018
4 changes: 4 additions & 0 deletions audio.py
@@ -8,6 +8,10 @@

 import lws

+# For solving issue #80: let callers inject hparams into this module
+def set_hparams(param):
+    global hparams
+    hparams = param

 def load_wav(path):
     return librosa.core.load(path, sr=hparams.sample_rate)[0]
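The new setter exists so that each caller can push its own hparams into this module before the helpers read it. A minimal caller-side sketch (the wav filename is hypothetical):

# Inject hparams first; load_wav reads the module-level hparams.sample_rate.
import audio
from hparams import hparams

audio.set_hparams(hparams)
wav = audio.load_wav('utterance_0001.wav')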
6 changes: 5 additions & 1 deletion hparams.py
@@ -127,7 +127,7 @@
     # Use only when MemoryError continues in Windows (Disabled by default)
     #gc_probability = 0.001,

-    # json_meta mode only
+    # Below are for json_meta mode only
     # 0: "use all",
     # 1: "ignore only unmatched_alignment",
     # 2: "fully ignore recognition",
@@ -136,6 +136,10 @@
     min_text=20,
     # if true, data without phoneme alignment file(.lab) will be ignored
     process_only_htk_aligned=False,
+    max_audio_length=0,  # in seconds, inactive when 0. (After preprocessing)
+    min_audio_length=0,  # in seconds, inactive when 0. (After preprocessing)
+    # Based on HTK-styled phoneme alignment
+    max_silence_length=0,  # in seconds, inactive when 0.
 )
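For context, a minimal sketch of how these knobs might be set, assuming the HParams.parse interface this file appears to use (the values are hypothetical examples, not recommendations):

# Keep utterances between 2 s and 12 s, cap kept silences at 1 s;
# a value of 0 leaves the corresponding filter disabled.
from hparams import hparams

hparams.parse("min_audio_length=2,max_audio_length=12,max_silence_length=1")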


80 changes: 66 additions & 14 deletions json_meta.py
@@ -108,11 +108,11 @@ def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
         if num_speakers == 1:
             # Single-speaker
             futures.append(executor.submit(
-                partial(_process_utterance_single, out_dir, text, audio_path)))
+                partial(_process_utterance_single, out_dir, text, audio_path, hparams=hparams)))
         else:
             # Multi-speaker
             futures.append(executor.submit(
-                partial(_process_utterance, out_dir, text, audio_path, speaker_id)))
+                partial(_process_utterance, out_dir, text, audio_path, speaker_id, hparams=hparams)))
         queue_count += 1
     print(" [*] Appended {} entries in the queue".format(queue_count))

@@ -158,11 +158,56 @@ def end_at(labels):
     for i in range(len(labels) - 2, 0, -1):
         if labels[i][-1] != "pau":
             return labels[i][1]
-    assert False
+
+    assert False
+
+# This cleans the audio segment by:
+# A. splitting the whole audio into segments separated by silences
+# B. removing the silence between segments (keeping any silent stretch
+#    shorter than hparams.max_silence_length)
+def clean_by_phoneme(labels, wav, sr):
+    # build segment separation
+    silences = []
+    prev_end = 0
+    sil_start = 0
+    at_silence = False
+    for idx, content in enumerate(labels):
+        start, end, label = content
+        # silence start condition
+        if not at_silence:
+            if label == 'pau' or label == 'silB' or label == 'silE':
+                at_silence = True
+                if start != prev_end and labels[idx-1][2] != 'oov':  # oov tends to have bad alignment timing
+                    sil_start = prev_end
+                else:
+                    sil_start = start
+            elif start != prev_end and labels[idx-1][2] != 'oov':
+                # one-time silence: an unlabeled gap between two non-silence phonemes
+                silences.append((prev_end, start))
+                at_silence = False
+                sil_start = 0
+        # silence end condition
+        else:
+            if label != 'pau' and label != 'silB' and label != 'silE':
+                silences.append((sil_start, start))
+                at_silence = False
+                sil_start = 0
+        prev_end = end  # always
+    if at_silence:
+        silences.append((sil_start, labels[-1][1]))

+    # Remove silence. HTK-style label times are integers in 100 ns units,
+    # so 1 second == 1e7 label units; hence the 1e7/1e-7 factors below.
+    prev_end = 0
+    result_wav_list = []
+    for start, end in silences:
+        sil_end = end if end - start < hparams.max_silence_length * 1e7 else start + int(hparams.max_silence_length * 1e7)
+        result_wav_list.append(wav[int(prev_end * 1e-7 * sr):int(sil_end * 1e-7 * sr)])
+        prev_end = end
+    result_wav = np.concatenate(result_wav_list, axis=0)

+    return result_wav
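As a sanity check on the 1e7/1e-7 factors above (a sketch; 22050 Hz is just an assumed example rate, not taken from the diff):

# HTK-style label times are integers in 100 ns units, so 1 s == 1e7 units.
sr = 22050                                # assumed example sample rate
htk_end = 35_000_000                      # a label ending at 3.5 s
sample_index = int(htk_end * 1e-7 * sr)   # == 77175, matching the slicing above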

-def _process_utterance(out_dir, text, wav_path, speaker_id=None):
+def _process_utterance(out_dir, text, wav_path, speaker_id=None, hparams=hparams):
+    audio.set_hparams(hparams)
     # check whether singlespeaker_mode
     if speaker_id is None:
         return _process_utterance_single(out_dir, text, wav_path)
@@ -179,9 +224,7 @@ def _process_utterance(out_dir, text, wav_path, speaker_id=None):
     # Trim silence from hts labels if available
     if exists(lab_path):
         labels = hts.load(lab_path)
-        b = int(start_at(labels) * 1e-7 * sr)
-        e = int(end_at(labels) * 1e-7 * sr)
-        wav = wav[b:e]
+        wav = clean_by_phoneme(labels, wav, sr)
         wav, _ = librosa.effects.trim(wav, top_db=25)
     else:
         if hparams.process_only_htk_aligned:
@@ -190,6 +233,11 @@

     if hparams.rescaling:
         wav = wav / np.abs(wav).max() * hparams.rescaling_max
+
+    if hparams.max_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) > hparams.max_audio_length:
+        return None
+    if hparams.min_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) < hparams.min_audio_length:
+        return None

     # Compute the linear-scale spectrogram from the wav:
     spectrogram = audio.spectrogram(wav).astype(np.float32)
@@ -212,9 +260,10 @@ def _process_utterance(out_dir, text, wav_path, speaker_id=None):
     # Return a tuple describing this training example:
     return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)

-def _process_utterance_single(out_dir, text, wav_path):
+def _process_utterance_single(out_dir, text, wav_path, hparams=hparams):
     # modified version of LJSpeech _process_utterance
+    audio.set_hparams(hparams)

     # Load the audio to a numpy array:
     wav = audio.load_wav(wav_path)
     sr = hparams.sample_rate
@@ -226,9 +275,7 @@ def _process_utterance_single(out_dir, text, wav_path):
     # Trim silence from hts labels if available
     if exists(lab_path):
         labels = hts.load(lab_path)
-        b = int(start_at(labels) * 1e-7 * sr)
-        e = int(end_at(labels) * 1e-7 * sr)
-        wav = wav[b:e]
+        wav = clean_by_phoneme(labels, wav, sr)
         wav, _ = librosa.effects.trim(wav, top_db=25)
     else:
         if hparams.process_only_htk_aligned:
@@ -238,7 +285,12 @@

     if hparams.rescaling:
         wav = wav / np.abs(wav).max() * hparams.rescaling_max

+    if hparams.max_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) > hparams.max_audio_length:
+        return None
+    if hparams.min_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) < hparams.min_audio_length:
+        return None

     # Compute the linear-scale spectrogram from the wav:
     spectrogram = audio.spectrogram(wav).astype(np.float32)
     n_frames = spectrogram.shape[1]
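A note on the pattern above: each dataset script now hands hparams to the worker via partial(..., hparams=hparams) and calls audio.set_hparams(hparams) inside the worker. The likely rationale (an assumption, not stated in the diff) is that ProcessPoolExecutor workers, particularly under the spawn start method on Windows, re-import modules and therefore never see hparams values parsed in the parent process. A minimal sketch of the convention, with hypothetical names:

# `my_worker` and `build` are hypothetical; the key points are that the
# worker's signature accepts the hparams keyword that partial() supplies,
# and that audio.set_hparams runs before any audio helper is used.
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import audio

def my_worker(out_dir, wav_path, text, hparams=None):
    audio.set_hparams(hparams)      # re-inject config in this process
    wav = audio.load_wav(wav_path)  # now uses hparams.sample_rate
    ...

def build(jobs, out_dir, hparams, num_workers=4):
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(partial(my_worker, out_dir, wav_path, text,
                                           hparams=hparams))
                   for wav_path, text in jobs]
        return [f.result() for f in futures]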
5 changes: 3 additions & 2 deletions jsut.py
@@ -21,11 +21,12 @@ def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):

     for index, (text, wav_path) in enumerate(zip(transcriptions, wav_paths)):
         futures.append(executor.submit(
-            partial(_process_utterance, out_dir, index + 1, wav_path, text)))
+            partial(_process_utterance, out_dir, index + 1, wav_path, text, hparams=hparams)))
     return [future.result() for future in tqdm(futures)]


-def _process_utterance(out_dir, index, wav_path, text):
+def _process_utterance(out_dir, index, wav_path, text, hparams=hparams):
+    audio.set_hparams(hparams)
     sr = hparams.sample_rate

     # Load the audio to a numpy array:
5 changes: 3 additions & 2 deletions ljspeech.py
@@ -30,12 +30,12 @@ def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
         wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
         text = parts[2]
         futures.append(executor.submit(
-            partial(_process_utterance, out_dir, index, wav_path, text)))
+            partial(_process_utterance, out_dir, index, wav_path, text, hparams=hparams)))
         index += 1
     return [future.result() for future in tqdm(futures)]


-def _process_utterance(out_dir, index, wav_path, text):
+def _process_utterance(out_dir, index, wav_path, text, hparams=hparams):
     '''Preprocesses a single utterance audio/text pair.

     This writes the mel and linear scale spectrograms to disk and returns a tuple to write
@@ -50,6 +50,7 @@ def _process_utterance(out_dir, index, wav_path, text):
     Returns:
         A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
     '''
+    audio.set_hparams(hparams)

     # Load the audio to a numpy array:
     wav = audio.load_wav(wav_path)
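For orientation, a minimal usage sketch of the documented interface (the paths and transcript are hypothetical, and in practice this function is invoked through build_from_path's executor rather than directly):

# Hypothetical direct call to illustrate the documented return tuple.
from hparams import hparams
import ljspeech

row = ljspeech._process_utterance('./training', 1,
                                  './LJSpeech-1.1/wavs/LJ001-0001.wav',
                                  'An example transcript.', hparams=hparams)
spectrogram_filename, mel_filename, n_frames, text = row  # as per the docstring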
7 changes: 4 additions & 3 deletions nikl_m.py
@@ -44,12 +44,12 @@ def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
         uid = re.search(r'([a-z][a-z][0-9][0-9]_t)', wav_path)
         uid = uid.group(1).replace('_t', '')
         futures.append(executor.submit(
-            partial(_process_utterance, out_dir, index + 1, spk_id[uid], wav_path, text)))
+            partial(_process_utterance, out_dir, index + 1, spk_id[uid], wav_path, text, hparams=hparams)))
         index += 1
     return [future.result() for future in tqdm(futures)]


-def _process_utterance(out_dir, index, speaker_id, wav_path, text):
+def _process_utterance(out_dir, index, speaker_id, wav_path, text, hparams):
     '''Preprocesses a single utterance audio/text pair.

     This writes the mel and linear scale spectrograms to disk and returns a tuple to write
@@ -64,7 +64,8 @@ def _process_utterance(out_dir, index, speaker_id, wav_path, text):
     Returns:
         A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
     '''
+    audio.set_hparams(hparams)

     # Load the audio to a numpy array:
     wav = audio.load_wav(wav_path)

5 changes: 3 additions & 2 deletions nikl_s.py
@@ -41,12 +41,12 @@ def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
         wav_path = parts[0]
         text = parts[1]
         futures.append(executor.submit(
-            partial(_process_utterance, out_dir, index + 1, wav_path, text)))
+            partial(_process_utterance, out_dir, index + 1, wav_path, text, hparams=hparams)))
         index += 1
     return [future.result() for future in tqdm(futures)]


-def _process_utterance(out_dir, index, wav_path, text):
+def _process_utterance(out_dir, index, wav_path, text, hparams=hparams):
     '''Preprocesses a single utterance audio/text pair.

     This writes the mel and linear scale spectrograms to disk and returns a tuple to write
@@ -61,6 +61,7 @@ def _process_utterance(out_dir, index, wav_path, text):
     Returns:
         A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
     '''
+    audio.set_hparams(hparams)

     # Load the audio to a numpy array:
     wav = audio.load_wav(wav_path)
5 changes: 3 additions & 2 deletions vctk.py
@@ -25,7 +25,7 @@ def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
     for index, (speaker_id, text, wav_path) in enumerate(
             zip(speaker_ids, transcriptions, wav_paths)):
         futures.append(executor.submit(
-            partial(_process_utterance, out_dir, index + 1, speaker_id, wav_path, text)))
+            partial(_process_utterance, out_dir, index + 1, speaker_id, wav_path, text, hparams=hparams)))
     return [future.result() for future in tqdm(futures)]


@@ -49,8 +49,9 @@ def end_at(labels):
     assert False


-def _process_utterance(out_dir, index, speaker_id, wav_path, text):
+def _process_utterance(out_dir, index, speaker_id, wav_path, text, hparams=hparams):
     sr = hparams.sample_rate
+    audio.set_hparams(hparams)

     # Load the audio to a numpy array:
     wav = audio.load_wav(wav_path)
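Taken together, a hypothetical end-to-end run of one dataset module with the new filters might look like this (names and paths are illustrative; the real entry point is presumably the repo's preprocess script):

# Hypothetical driver mirroring what a preprocess entry point would do.
from hparams import hparams
import json_meta

hparams.parse("max_audio_length=12,min_audio_length=2,max_silence_length=1")
metadata = json_meta.build_from_path('./datasets/raw', './datasets/processed',
                                     num_workers=4)
metadata = [m for m in metadata if m is not None]  # drop length-filtered utterances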