Issue80 #8

Merged
merged 9 commits, May 12, 2018
4 changes: 4 additions & 0 deletions audio.py
@@ -8,6 +8,10 @@

 import lws

+# For solving issue #80: let callers inject hparams into this module
+def set_hparams(param):
+    global hparams
+    hparams = param

 def load_wav(path):
     return librosa.core.load(path, sr=hparams.sample_rate)[0]
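The new setter exists so that each caller can push its own hparams into this module before the helpers read it. A minimal caller-side sketch (the wav filename is hypothetical):

# Inject hparams first; load_wav reads the module-level hparams.sample_rate.
import audio
from hparams import hparams

audio.set_hparams(hparams)
wav = audio.load_wav('utterance_0001.wav')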
6 changes: 5 additions & 1 deletion hparams.py
@@ -127,7 +127,7 @@
     # Use only when MemoryError continues in Windows (Disabled by default)
     #gc_probability = 0.001,

-    # json_meta mode only
+    # Below are for json_meta mode only
     # 0: "use all",
     # 1: "ignore only unmatched_alignment",
     # 2: "fully ignore recognition",
@@ -136,6 +136,10 @@
     min_text=20,
     # if true, data without phoneme alignment file(.lab) will be ignored
     process_only_htk_aligned=False,
+    max_audio_length=0,  # in seconds, inactive when 0. (After preprocessing)
+    min_audio_length=0,  # in seconds, inactive when 0. (After preprocessing)
+    # Based on HTK-styled phoneme alignment
+    max_silence_length=0,  # in seconds, inactive when 0.
 )
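For context, a minimal sketch of how these knobs might be set, assuming the HParams.parse interface this file appears to use (the values are hypothetical examples, not recommendations):

# Keep utterances between 2 s and 12 s, cap kept silences at 1 s;
# a value of 0 leaves the corresponding filter disabled.
from hparams import hparams

hparams.parse("min_audio_length=2,max_audio_length=12,max_silence_length=1")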


80 changes: 66 additions & 14 deletions json_meta.py
@@ -108,11 +108,11 @@ def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
         if num_speakers == 1:
             # Single-speaker
             futures.append(executor.submit(
-                partial(_process_utterance_single, out_dir, text, audio_path)))
+                partial(_process_utterance_single, out_dir, text, audio_path, hparams=hparams)))
         else:
             # Multi-speaker
             futures.append(executor.submit(
-                partial(_process_utterance, out_dir, text, audio_path, speaker_id)))
+                partial(_process_utterance, out_dir, text, audio_path, speaker_id, hparams=hparams)))
         queue_count += 1
     print(" [*] Appended {} entries in the queue".format(queue_count))

@@ -158,11 +158,56 @@ def end_at(labels):
     for i in range(len(labels) - 2, 0, -1):
         if labels[i][-1] != "pau":
             return labels[i][1]
-    assert False
+
+    assert False
+
+# This cleans the audio segment by:
+# A. splitting the whole audio into segments separated by silences
+# B. removing the silence between segments (keeping any silent stretch
+#    shorter than hparams.max_silence_length)
+def clean_by_phoneme(labels, wav, sr):
+    # build segment separation
+    silences = []
+    prev_end = 0
+    sil_start = 0
+    at_silence = False
+    for idx, content in enumerate(labels):
+        start, end, label = content
+        # silence start condition
+        if not at_silence:
+            if label == 'pau' or label == 'silB' or label == 'silE':
+                at_silence = True
+                if start != prev_end and labels[idx-1][2] != 'oov':  # oov tends to have bad alignment timing
+                    sil_start = prev_end
+                else:
+                    sil_start = start
+            elif start != prev_end and labels[idx-1][2] != 'oov':
+                # one-time silence: an unlabeled gap between two non-silence phonemes
+                silences.append((prev_end, start))
+                at_silence = False
+                sil_start = 0
+        # silence end condition
+        else:
+            if label != 'pau' and label != 'silB' and label != 'silE':
+                silences.append((sil_start, start))
+                at_silence = False
+                sil_start = 0
+        prev_end = end  # always
+    if at_silence:
+        silences.append((sil_start, labels[-1][1]))

+    # Remove silence. HTK-style label times are integers in 100 ns units,
+    # so 1 second == 1e7 label units; hence the 1e7/1e-7 factors below.
+    prev_end = 0
+    result_wav_list = []
+    for start, end in silences:
+        sil_end = end if end - start < hparams.max_silence_length * 1e7 else start + int(hparams.max_silence_length * 1e7)
+        result_wav_list.append(wav[int(prev_end * 1e-7 * sr):int(sil_end * 1e-7 * sr)])
+        prev_end = end
+    result_wav = np.concatenate(result_wav_list, axis=0)

+    return result_wav
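As a sanity check on the 1e7/1e-7 factors above (a sketch; 22050 Hz is just an assumed example rate, not taken from the diff):

# HTK-style label times are integers in 100 ns units, so 1 s == 1e7 units.
sr = 22050                                # assumed example sample rate
htk_end = 35_000_000                      # a label ending at 3.5 s
sample_index = int(htk_end * 1e-7 * sr)   # == 77175, matching the slicing above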

-def _process_utterance(out_dir, text, wav_path, speaker_id=None):
+def _process_utterance(out_dir, text, wav_path, speaker_id=None, hparams=hparams):
+    audio.set_hparams(hparams)
     # check whether singlespeaker_mode
     if speaker_id is None:
         return _process_utterance_single(out_dir, text, wav_path)
@@ -179,9 +224,7 @@ def _process_utterance(out_dir, text, wav_path, speaker_id=None):
     # Trim silence from hts labels if available
     if exists(lab_path):
         labels = hts.load(lab_path)
-        b = int(start_at(labels) * 1e-7 * sr)
-        e = int(end_at(labels) * 1e-7 * sr)
-        wav = wav[b:e]
+        wav = clean_by_phoneme(labels, wav, sr)
         wav, _ = librosa.effects.trim(wav, top_db=25)
     else:
         if hparams.process_only_htk_aligned:
@@ -190,6 +233,11 @@

     if hparams.rescaling:
         wav = wav / np.abs(wav).max() * hparams.rescaling_max
+
+    if hparams.max_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) > hparams.max_audio_length:
+        return None
+    if hparams.min_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) < hparams.min_audio_length:
+        return None

     # Compute the linear-scale spectrogram from the wav:
     spectrogram = audio.spectrogram(wav).astype(np.float32)
@@ -212,9 +260,10 @@ def _process_utterance(out_dir, text, wav_path, speaker_id=None):
     # Return a tuple describing this training example:
     return (spectrogram_filename, mel_filename, n_frames, text, speaker_id)

-def _process_utterance_single(out_dir, text, wav_path):
+def _process_utterance_single(out_dir, text, wav_path, hparams=hparams):
     # modified version of LJSpeech _process_utterance
+    audio.set_hparams(hparams)

     # Load the audio to a numpy array:
     wav = audio.load_wav(wav_path)
     sr = hparams.sample_rate
@@ -226,9 +275,7 @@ def _process_utterance_single(out_dir, text, wav_path):
     # Trim silence from hts labels if available
     if exists(lab_path):
         labels = hts.load(lab_path)
-        b = int(start_at(labels) * 1e-7 * sr)
-        e = int(end_at(labels) * 1e-7 * sr)
-        wav = wav[b:e]
+        wav = clean_by_phoneme(labels, wav, sr)
         wav, _ = librosa.effects.trim(wav, top_db=25)
     else:
         if hparams.process_only_htk_aligned:
@@ -238,7 +285,12 @@

     if hparams.rescaling:
         wav = wav / np.abs(wav).max() * hparams.rescaling_max

+    if hparams.max_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) > hparams.max_audio_length:
+        return None
+    if hparams.min_audio_length != 0 and librosa.core.get_duration(y=wav, sr=sr) < hparams.min_audio_length:
+        return None

     # Compute the linear-scale spectrogram from the wav:
     spectrogram = audio.spectrogram(wav).astype(np.float32)
     n_frames = spectrogram.shape[1]
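A note on the pattern above: each dataset script now hands hparams to the worker via partial(..., hparams=hparams) and calls audio.set_hparams(hparams) inside the worker. The likely rationale (an assumption, not stated in the diff) is that ProcessPoolExecutor workers, particularly under the spawn start method on Windows, re-import modules and therefore never see hparams values parsed in the parent process. A minimal sketch of the convention, with hypothetical names:

# `my_worker` and `build` are hypothetical; the key points are that the
# worker's signature accepts the hparams keyword that partial() supplies,
# and that audio.set_hparams runs before any audio helper is used.
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import audio

def my_worker(out_dir, wav_path, text, hparams=None):
    audio.set_hparams(hparams)      # re-inject config in this process
    wav = audio.load_wav(wav_path)  # now uses hparams.sample_rate
    ...

def build(jobs, out_dir, hparams, num_workers=4):
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(partial(my_worker, out_dir, wav_path, text,
                                           hparams=hparams))
                   for wav_path, text in jobs]
        return [f.result() for f in futures]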
5 changes: 3 additions & 2 deletions jsut.py
@@ -21,11 +21,12 @@ def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):

     for index, (text, wav_path) in enumerate(zip(transcriptions, wav_paths)):
         futures.append(executor.submit(
-            partial(_process_utterance, out_dir, index + 1, wav_path, text)))
+            partial(_process_utterance, out_dir, index + 1, wav_path, text, hparams=hparams)))
     return [future.result() for future in tqdm(futures)]


-def _process_utterance(out_dir, index, wav_path, text):
+def _process_utterance(out_dir, index, wav_path, text, hparams=hparams):
+    audio.set_hparams(hparams)
     sr = hparams.sample_rate

     # Load the audio to a numpy array:
5 changes: 3 additions & 2 deletions ljspeech.py
@@ -30,12 +30,12 @@ def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
         wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
         text = parts[2]
         futures.append(executor.submit(
-            partial(_process_utterance, out_dir, index, wav_path, text)))
+            partial(_process_utterance, out_dir, index, wav_path, text, hparams=hparams)))
         index += 1
     return [future.result() for future in tqdm(futures)]


-def _process_utterance(out_dir, index, wav_path, text):
+def _process_utterance(out_dir, index, wav_path, text, hparams=hparams):
     '''Preprocesses a single utterance audio/text pair.

     This writes the mel and linear scale spectrograms to disk and returns a tuple to write
@@ -50,6 +50,7 @@ def _process_utterance(out_dir, index, wav_path, text):
     Returns:
         A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
     '''
+    audio.set_hparams(hparams)

     # Load the audio to a numpy array:
     wav = audio.load_wav(wav_path)
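For orientation, a minimal usage sketch of the documented interface (the paths and transcript are hypothetical, and in practice this function is invoked through build_from_path's executor rather than directly):

# Hypothetical direct call to illustrate the documented return tuple.
from hparams import hparams
import ljspeech

row = ljspeech._process_utterance('./training', 1,
                                  './LJSpeech-1.1/wavs/LJ001-0001.wav',
                                  'An example transcript.', hparams=hparams)
spectrogram_filename, mel_filename, n_frames, text = row  # as per the docstring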
7 changes: 4 additions & 3 deletions nikl_m.py
@@ -44,12 +44,12 @@ def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
         uid = re.search(r'([a-z][a-z][0-9][0-9]_t)', wav_path)
         uid = uid.group(1).replace('_t', '')
         futures.append(executor.submit(
-            partial(_process_utterance, out_dir, index + 1, spk_id[uid], wav_path, text)))
+            partial(_process_utterance, out_dir, index + 1, spk_id[uid], wav_path, text, hparams=hparams)))
         index += 1
     return [future.result() for future in tqdm(futures)]


-def _process_utterance(out_dir, index, speaker_id, wav_path, text):
+def _process_utterance(out_dir, index, speaker_id, wav_path, text, hparams):
     '''Preprocesses a single utterance audio/text pair.

     This writes the mel and linear scale spectrograms to disk and returns a tuple to write
@@ -64,7 +64,8 @@ def _process_utterance(out_dir, index, speaker_id, wav_path, text):
     Returns:
         A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
     '''
+    audio.set_hparams(hparams)

     # Load the audio to a numpy array:
     wav = audio.load_wav(wav_path)

5 changes: 3 additions & 2 deletions nikl_s.py
@@ -41,12 +41,12 @@ def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
         wav_path = parts[0]
         text = parts[1]
         futures.append(executor.submit(
-            partial(_process_utterance, out_dir, index + 1, wav_path, text)))
+            partial(_process_utterance, out_dir, index + 1, wav_path, text, hparams=hparams)))
         index += 1
     return [future.result() for future in tqdm(futures)]


-def _process_utterance(out_dir, index, wav_path, text):
+def _process_utterance(out_dir, index, wav_path, text, hparams=hparams):
     '''Preprocesses a single utterance audio/text pair.

     This writes the mel and linear scale spectrograms to disk and returns a tuple to write
@@ -61,6 +61,7 @@ def _process_utterance(out_dir, index, wav_path, text):
     Returns:
         A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
     '''
+    audio.set_hparams(hparams)

     # Load the audio to a numpy array:
     wav = audio.load_wav(wav_path)
5 changes: 3 additions & 2 deletions vctk.py
@@ -25,7 +25,7 @@ def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
     for index, (speaker_id, text, wav_path) in enumerate(
             zip(speaker_ids, transcriptions, wav_paths)):
         futures.append(executor.submit(
-            partial(_process_utterance, out_dir, index + 1, speaker_id, wav_path, text)))
+            partial(_process_utterance, out_dir, index + 1, speaker_id, wav_path, text, hparams=hparams)))
     return [future.result() for future in tqdm(futures)]


@@ -49,8 +49,9 @@ def end_at(labels):
     assert False


-def _process_utterance(out_dir, index, speaker_id, wav_path, text):
+def _process_utterance(out_dir, index, speaker_id, wav_path, text, hparams=hparams):
     sr = hparams.sample_rate
+    audio.set_hparams(hparams)

     # Load the audio to a numpy array:
     wav = audio.load_wav(wav_path)
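Taken together, a hypothetical end-to-end run of one dataset module with the new filters might look like this (names and paths are illustrative; the real entry point is presumably the repo's preprocess script):

# Hypothetical driver mirroring what a preprocess entry point would do.
from hparams import hparams
import json_meta

hparams.parse("max_audio_length=12,min_audio_length=2,max_silence_length=1")
metadata = json_meta.build_from_path('./datasets/raw', './datasets/processed',
                                     num_workers=4)
metadata = [m for m in metadata if m is not None]  # drop length-filtered utterances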