
Commit

add support for WSJ corpus
nq555222 committed May 4, 2017
1 parent 76706da commit 5fdf7ce
Showing 11 changed files with 516 additions and 16 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -13,7 +13,8 @@ End-to-end automatic speech recognition system implemented in TensorFlow.
- [x] **Add simple n-gram model for random generation or statistical use** (2017-03-23)
- [x] **Improve some code for pre-processing and training** (2017-03-23)
- [x] **Replace TABs with blanks and add nist2wav converter script** (2017-04-20)
- [x] **Add some data preparation code** (2017-05-1)
- [x] **Add some data preparation code** (2017-05-01)
- [x] **Add WSJ corpus standard preprocessing by s5 recipe** (2017-05-05)

## Recommendation
If you want to replace the feed_dict operation with a TensorFlow multi-threaded FIFOQueue input pipeline, you can refer to my repo [TensorFlow-Input-Pipeline](https://github.com/zzw922cn/TensorFlow-Input-Pipeline) for more example code. In my own experience, a FIFOQueue input pipeline can improve training speed in some cases.
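For illustration only, here is a minimal sketch of the idea against the TensorFlow 1.x queue API; the shapes, capacity, and random-feature stand-in are placeholder assumptions rather than code from either repository:

import tensorflow as tf

features = tf.random_normal([64, 39])               # stand-in for a real feature batch
queue = tf.FIFOQueue(capacity=100, dtypes=[tf.float32], shapes=[[64, 39]])
enqueue_op = queue.enqueue(features)
batch = queue.dequeue()                             # consumed by the training graph instead of feed_dict

qr = tf.train.QueueRunner(queue, [enqueue_op] * 4)  # 4 background enqueue threads
with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = qr.create_threads(sess, coord=coord, start=True)
    print(sess.run(batch).shape)                    # (64, 39)
    coord.request_stop()
    coord.join(threads)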
22 changes: 22 additions & 0 deletions src/feature/core/__init__.py
@@ -0,0 +1,22 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
''' Automatic Speech Recognition
author:
iiiiiiiiiiii iiiiiiiiiiii !!!!!!! !!!!!!
# ### # ### ### I# #:
# ### # I##; ##; ## ##
### ### !## #### #
### ### ### ## ### #'
!##; `##% ##; ## ### ##
### ### $## `# ## #
### # ### # #### ####;
`### -# ### `# ### ###
############## ############## `# #
date:2017-4-15
'''

import sys
sys.path.append('../')
sys.dont_write_bytecode = True
183 changes: 183 additions & 0 deletions src/feature/core/calcmfcc.py
@@ -0,0 +1,183 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

''' calculate mfcc feature vectors.
author:
iiiiiiiiiiii iiiiiiiiiiii !!!!!!! !!!!!!
# ### # ### ### I# #:
# ### # I##; ##; ## ##
### ### !## #### #
### ### ### ## ### #'
!##; `##% ##; ## ### ##
### ### $## `# ## #
### # ### # #### ####;
`### -# ### `# ### ###
############## ############## `# #
date:2016-11-09
'''


import numpy
from sigprocess import audio2frame
from sigprocess import pre_emphasis
from sigprocess import spectrum_power
from scipy.fftpack import dct
# For Python 3.x compatibility: xrange was removed, so fall back to range when xrange is undefined.
try:
    xrange(1)
except NameError:
    xrange = range



def calcMFCC_delta_delta(signal,samplerate=16000,win_length=0.025,win_step=0.01,cep_num=13,filters_num=26,NFFT=512,low_freq=0,high_freq=None,pre_emphasis_coeff=0.97,cep_lifter=22,appendEnergy=True):
    '''Compute 13 MFCCs plus 13 delta and 13 delta-delta (acceleration) coefficients, 39 in total.
    '''
    feat = calcMFCC(signal,samplerate,win_length,win_step,cep_num,filters_num,NFFT,low_freq,high_freq,pre_emphasis_coeff,cep_lifter,appendEnergy)  # 13 base MFCC coefficients
    feat_delta = delta(feat)
    feat_delta_delta = delta(feat_delta)

    result = numpy.concatenate((feat, feat_delta, feat_delta_delta), axis=1)
    return result

def delta(feat, N=2):
    """Compute delta features from a feature vector sequence.
    :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
    :param N: For each frame, calculate delta features based on preceding and following N frames
    :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
    """
    NUMFRAMES = len(feat)
    feat = numpy.concatenate(([feat[0] for i in range(N)], feat, [feat[-1] for i in range(N)]))
    denom = sum([2*i*i for i in range(1, N+1)])
    dfeat = []
    for j in range(NUMFRAMES):
        dfeat.append(numpy.sum([n*feat[N+j+n] for n in range(-1*N, N+1)], axis=0)/denom)
    return dfeat
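# Worked example (illustrative): with the default N=2 the denominator is 2*(1*1 + 2*2) = 10,
# so the delta for frame t is (1*(feat[t+1]-feat[t-1]) + 2*(feat[t+2]-feat[t-2])) / 10,
# with the first and last frames repeated N times as padding at the edges.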

def calcMFCC(signal,samplerate=16000,win_length=0.025,win_step=0.01,cep_num=13,filters_num=26,NFFT=512,low_freq=0,high_freq=None,pre_emphasis_coeff=0.97,cep_lifter=22,appendEnergy=True):
    '''Compute 13 MFCC coefficients.
    Parameters:
    signal: the raw audio signal, typically read from a .wav file
    samplerate: sampling rate, 16 kHz by default
    win_length: window length in seconds, 0.025 (a 25 ms frame) by default
    win_step: window step in seconds, 0.01 (10 ms between successive frame starts) by default
    cep_num: number of cepstral coefficients, 13 by default
    filters_num: number of Mel filters, 26 by default
    NFFT: FFT size, 512 by default
    low_freq: lowest filter frequency, 0 by default
    high_freq: highest filter frequency
    pre_emphasis_coeff: pre-emphasis coefficient, 0.97 by default
    cep_lifter: cepstral liftering parameter, 22 by default
    appendEnergy: whether to replace the first coefficient with the log energy, True by default
    '''

    feat, energy = fbank(signal,samplerate,win_length,win_step,filters_num,NFFT,low_freq,high_freq,pre_emphasis_coeff)
    feat = numpy.log(feat)
    feat = dct(feat, type=2, axis=1, norm='ortho')[:, :cep_num]  # discrete cosine transform, keep only the first cep_num coefficients
    feat = lifter(feat, cep_lifter)
    if appendEnergy:
        feat[:, 0] = numpy.log(energy)  # keep coefficients 2-13 and replace the first one with the log of the frame energy
    return feat

def fbank(signal,samplerate=16000,win_length=0.025,win_step=0.01,filters_num=26,NFFT=512,low_freq=0,high_freq=None,pre_emphasis_coeff=0.97):
    '''Compute Mel filterbank energies and per-frame energies of the audio signal.
    Parameters:
    samplerate: sampling rate
    win_length: window length
    win_step: window step
    filters_num: number of Mel filters
    NFFT: FFT size
    low_freq: lowest frequency
    high_freq: highest frequency
    pre_emphasis_coeff: pre-emphasis coefficient
    '''

    high_freq = high_freq or samplerate/2  # highest analysable frequency of the samples (Nyquist) by default
    signal = pre_emphasis(signal, pre_emphasis_coeff)  # apply pre-emphasis to the raw signal
    frames = audio2frame(signal, win_length*samplerate, win_step*samplerate)  # split the signal into frames
    spec_power = spectrum_power(frames, NFFT)  # power spectrum of each frame after the FFT
    energy = numpy.sum(spec_power, 1)  # total energy of each frame, obtained by summing its power spectrum
    energy = numpy.where(energy == 0, numpy.finfo(float).eps, energy)  # replace zeros with eps so the log is well defined
    fb = get_filter_banks(filters_num, NFFT, samplerate, low_freq, high_freq)  # Mel filterbank matrix (one row per filter)
    feat = numpy.dot(spec_power, fb.T)  # filterbank energies: power spectrum times filterbank
    feat = numpy.where(feat == 0, numpy.finfo(float).eps, feat)  # likewise, avoid zeros
    return feat, energy

def log_fbank(signal,samplerate=16000,win_length=0.025,win_step=0.01,filters_num=26,NFFT=512,low_freq=0,high_freq=None,pre_emphasis_coeff=0.97):
    '''Compute log Mel filterbank energies.
    Parameters: same as above.
    '''
    feat, energy = fbank(signal,samplerate,win_length,win_step,filters_num,NFFT,low_freq,high_freq,pre_emphasis_coeff)
    return numpy.log(feat)

def ssc(signal,samplerate=16000,win_length=0.025,win_step=0.01,filters_num=26,NFFT=512,low_freq=0,high_freq=None,pre_emphasis_coeff=0.97):
    '''To be completed: computes spectral subband centroid (SSC) features.
    '''
    high_freq = high_freq or samplerate/2
    signal = pre_emphasis(signal, pre_emphasis_coeff)
    frames = audio2frame(signal, win_length*samplerate, win_step*samplerate)
    spec_power = spectrum_power(frames, NFFT)
    spec_power = numpy.where(spec_power == 0, numpy.finfo(float).eps, spec_power)  # power spectrum
    fb = get_filter_banks(filters_num, NFFT, samplerate, low_freq, high_freq)
    feat = numpy.dot(spec_power, fb.T)  # filterbank energies
    R = numpy.tile(numpy.linspace(1, samplerate/2, numpy.size(spec_power, 1)), (numpy.size(spec_power, 0), 1))
    return numpy.dot(spec_power*R, fb.T)/feat

def hz2mel(hz):
    '''Convert a frequency in Hz to the Mel scale.
    Parameters:
    hz: frequency in Hz
    '''
    return 2595*numpy.log10(1+hz/700.0)

def mel2hz(mel):
    '''Convert a Mel-scale frequency back to Hz.
    Parameters:
    mel: Mel-scale frequency
    '''
    return 700*(10**(mel/2595.0)-1)
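# Sanity check (illustrative): the 2595 and 700 constants are chosen so that 1000 Hz maps to
# approximately 1000 mel, i.e. hz2mel(1000) = 2595*log10(1 + 1000/700) ≈ 1000, and
# mel2hz(hz2mel(f)) recovers f up to floating-point error.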

def get_filter_banks(filters_num=20,NFFT=512,samplerate=16000,low_freq=0,high_freq=None):
    '''Build the triangular Mel-spaced filterbank: each filter is 0 at its first and third
    frequency points and 1 at its second (centre) point.
    Parameters:
    filters_num: number of filters
    NFFT: FFT size
    samplerate: sampling rate
    low_freq: lowest frequency
    high_freq: highest frequency
    '''
    # First convert the Hz frequencies to the Mel scale: the ear's frequency resolution is not
    # linear in Hz, so the filters are spaced linearly in Mel instead.
    low_mel = hz2mel(low_freq)
    high_mel = hz2mel(high_freq)
    # Insert filters_num points evenly between low_mel and high_mel, giving filters_num+2 points in total.
    mel_points = numpy.linspace(low_mel, high_mel, filters_num+2)
    # Convert the Mel points back to Hz frequencies.
    hz_points = mel2hz(mel_points)
    # Map each hz_point to the corresponding FFT bin.
    bin = numpy.floor((NFFT+1)*hz_points/samplerate)
    # Build the filters: each one is 0 at its first and third points and triangular in between.
    fbank = numpy.zeros([filters_num, NFFT//2+1])  # integer division so the shape is valid on Python 3
    for j in xrange(0, filters_num):
        for i in xrange(int(bin[j]), int(bin[j+1])):
            fbank[j, i] = (i-bin[j])/(bin[j+1]-bin[j])
        for i in xrange(int(bin[j+1]), int(bin[j+2])):
            fbank[j, i] = (bin[j+2]-i)/(bin[j+2]-bin[j+1])
    return fbank

def lifter(cepstra, L=22):
    '''Apply cepstral liftering to the MFCC matrix.
    Parameters:
    cepstra: matrix of MFCC coefficients
    L: liftering parameter, 22 by default
    '''
    if L > 0:
        nframes, ncoeff = numpy.shape(cepstra)
        n = numpy.arange(ncoeff)
        lift = 1+(L/2)*numpy.sin(numpy.pi*n/L)
        return lift*cepstra
    else:
        return cepstra
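A minimal usage sketch for the 39-dimensional feature function above, assuming it is run from src/feature/core, that the input is a 16 kHz mono WAV readable with scipy.io.wavfile, and that the filename is a hypothetical placeholder:

import scipy.io.wavfile as wav
from calcmfcc import calcMFCC_delta_delta

rate, sig = wav.read('example.wav')                # 1-D sample array for mono audio
feat = calcMFCC_delta_delta(sig, samplerate=rate)  # one row per frame, 39 columns
print(feat.shape)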
21 changes: 21 additions & 0 deletions src/feature/core/fileUtils.py
@@ -0,0 +1,21 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
''' Automatic Speech Recognition
author(s):
zzw922cn
date:2017-5-5
'''

import sys
sys.path.append('../')
sys.dont_write_bytecode = True

import os

def check_path_exists(path):
    """ Create the directory at `path` if it does not already exist.
    """
    if not os.path.exists(path):
        os.makedirs(path)
20 changes: 20 additions & 0 deletions src/feature/core/nist2wav.py
@@ -0,0 +1,20 @@
#!/usr/bin/python

import sys
sys.path.append('../')
sys.dont_write_bytecode = True

import subprocess
import os

def nist2wav(src_dir):
    count = 0
    for subdir, dirs, files in os.walk(src_dir):
        for f in files:
            fullFilename = os.path.join(subdir, f)
            if f.endswith('.wv1') or f.endswith('.wv2'):
                count += 1
                os.system("./sph2pipe_v2.5/sph2pipe "+fullFilename+" -f rif " +fullFilename+".wav")
                print(fullFilename)

nist2wav('/home/pony/wsj/')
24 changes: 24 additions & 0 deletions src/feature/core/nist2wav.sh
@@ -0,0 +1,24 @@
#!/bin/bash

# author:Rongyi Lan
# email:brianlanbo@gmail.com

# This script converts NIST (SPHERE) format audio to WAV format.
# To run it, install the libsndfile tools (sndfile-convert) first.


target_dir=$1

fnames=(`find $target_dir -name "*.wv1"`)

for fname in "${fnames[@]}"
do
  # Rename each NIST file aside, convert it back to the original name as WAV with
  # sndfile-convert, and restore the original file if the conversion fails.
  mv "$fname" "${fname%.wav}.nist"
  sndfile-convert "${fname%.wav}.nist" "$fname"
  if [ $? = 0 ]; then
    echo "renamed $fname to nist and converted back to wav using sndfile-convert"
  else
    mv "${fname%.wav}.nist" "$fname"
  fi
done
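Usage note: invoke the script with the corpus directory as its only argument, for example "bash nist2wav.sh /path/to/wsj" (the path here is only a hypothetical example).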