add DeepSpeech2

gdy1201 · Jul 23, 2017 · 5956b5c · 5956b5c
1 parent 0cdd168
commit 5956b5c
Show file tree

Hide file tree

Showing 5 changed files with 49 additions and 52 deletions.
diff --git a/README.md b/README.md
@@ -17,6 +17,8 @@ End-to-end automatic speech recognition system implemented in TensorFlow.
 - [x] **Add WSJ corpus standard preprocessing by s5 recipe** (2017-05-05)
 - [x] **Restructuring of the project. Updated train.py for usage convenience** (2017-05-06)
 - [x] **Finish feature module for timit, libri, wsj, support training for LibriSpeech** (2017-05-14)
+- [x] **Remove some unnecessary code** (2017-07-22)
+- [x] **Add DeepSpeech2 implementation code** (2017-07-23)
 
 ## Recommendation
 If you want to replace the feed dict operation with a TensorFlow multi-threaded FIFOQueue input pipeline, you can refer to my repo [TensorFlow-Input-Pipeline](https://github.com/zzw922cn/TensorFlow-Input-Pipeline) for more example code. In my own experience, a FIFOQueue input pipeline can improve training speed in some cases.

diff --git a/feature/core/spectrogram.py b/feature/core/spectrogram.py
@@ -0,0 +1,45 @@
+# -*- coding:utf-8 -*-
+
+""" Calculate the spectrogram of power of an audio file (.wav format)
+
+
+@author: zhang zewang
+@date: 2017-7-22
+"""
+
+import numpy as np
+import scipy.io.wavfile as wav
+import librosa
+from sklearn import preprocessing
+
+def spectrogramPower(audio, window_size=0.02, window_stride=0.01):
+    """Compute a normalized log-magnitude spectrogram of a .wav file via STFT.
+
+    Args:
+        audio: the .wav input, passed directly to scipy.io.wavfile.read.
+        window_size: STFT window width in seconds; converted to samples as
+            int(window_size * samplingRate), i.e. window_time / Ts, and also
+            used as n_fft.
+        window_stride: hop between successive windows in seconds (like the
+            stride of a convolution); converted to samples the same way.
+
+    Returns:
+        log1p of the STFT magnitude, standardized with
+        sklearn.preprocessing.scale; laid out as frequency_bins x time_len.
+    """
+    samplingRate, samples = wav.read(audio)
+    win_length = int(window_size * samplingRate)
+    hop_length = int(window_stride * samplingRate)
+    n_fft = win_length
+    D = librosa.core.stft(samples, n_fft=n_fft,hop_length=hop_length,
+                      win_length=win_length)
+    mag = np.abs(D)  # magnitude (not squared power, despite the function name)
+    log_mag = np.log1p(mag)
+    # normalization (sklearn scale called with its default arguments)
+    log_mag = preprocessing.scale(log_mag)
+    # size: frequency_bins*time_len
+    return log_mag
+
+
+if __name__ == '__main__':
+    print np.shape(spectrogramPower('test.wav'))  # smoke test: print the spectrogram shape of test.wav
diff --git a/feature/core/statistic.py b/feature/core/statistic.py
diff --git a/feature/core/test.wav b/feature/core/test.wav
diff --git a/feature/timit/timit_preprocess.py b/feature/timit/timit_preprocess.py
@@ -27,6 +27,7 @@
 import argparse
 from core.sigprocess import *
 from core.calcmfcc import calcfeat_delta_delta
+from core.spectrogram import spectrogramPower
 import scipy.io.wavfile as wav
 import numpy as np
 import glob
@@ -42,7 +43,7 @@
 ## cleaned phonemes
 #phn = ['sil', 'aa', 'ae', 'ah', 'ao', 'aw', 'ax', 'ax-h', 'ay', 'b', 'ch', 'd', 'dh', 'dx', 'eh', 'el', 'en', 'epi', 'er', 'ey', 'f', 'g', 'hh', 'ih', 'ix', 'iy', 'jh', 'k', 'l', 'm', 'n', 'ng', 'ow', 'oy', 'p', 'q', 'r', 's', 'sh', 't', 'th', 'uh', 'uw', 'v', 'w', 'y', 'z', 'zh']
 
-def wav2feature(rootdir, save_directory, mode, feature_len,level, keywords, win_len, win_step,  seq2seq, save):
+def wav2feature(rootdir, save_directory, mode, feature_len, level, keywords, win_len, win_step,  seq2seq, save):
     feat_dir = os.path.join(save_directory, level, keywords, mode)
     label_dir = os.path.join(save_directory, level, keywords, 'label')
     if not os.path.exists(label_dir):