Skip to content

Add NoisePerturbAugmentor and CHiME3 data preparation. #140

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Aug 8, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file modified deep_speech_2/README.md
100755 → 100644
Empty file.
8 changes: 8 additions & 0 deletions deep_speech_2/conf/augmentation.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
[
{
"type": "shift",
"params": {"min_shift_ms": -5,
"max_shift_ms": 5},
"prob": 1.0
}
]
39 changes: 39 additions & 0 deletions deep_speech_2/conf/augmentation.config.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
[
{
"type": "noise",
"params": {"min_snr_dB": 40,
"max_snr_dB": 50,
"noise_manifest_path": "datasets/manifest.noise"},
"prob": 0.6
},
{
"type": "impulse",
"params": {"impulse_manifest_path": "datasets/manifest.impulse"},
"prob": 0.5
},
{
"type": "speed",
"params": {"min_speed_rate": 0.95,
"max_speed_rate": 1.05},
"prob": 0.5
},
{
"type": "shift",
"params": {"min_shift_ms": -5,
"max_shift_ms": 5},
"prob": 1.0
},
{
"type": "volume",
"params": {"min_gain_dBFS": -10,
"max_gain_dBFS": 10},
"prob": 0.0
},
{
"type": "bayesian_normal",
"params": {"target_db": -20,
"prior_db": -20,
"prior_samples": 100},
"prob": 0.0
}
]
8 changes: 4 additions & 4 deletions deep_speech_2/data_utils/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ def superimpose(self, other):
:raise ValueError: If the sample rates of the two segments are not
equal, or if the lengths of segments don't match.
"""
if type(self) != type(other):
if isinstance(other, type(self)):
raise TypeError("Cannot add segments of different types: %s "
"and %s." % (type(self), type(other)))
if self._sample_rate != other._sample_rate:
Expand All @@ -231,7 +231,7 @@ def gain_db(self, gain):
Note that this is an in-place transformation.

:param gain: Gain in decibels to apply to samples.
:type gain: float
:type gain: float|1darray
"""
self._samples *= 10.**(gain / 20.)

Expand Down Expand Up @@ -457,9 +457,9 @@ def convolve(self, impulse_segment, allow_resample=False):
audio segments when resample is not allowed.
"""
if allow_resample and self.sample_rate != impulse_segment.sample_rate:
impulse_segment = impulse_segment.resample(self.sample_rate)
impulse_segment.resample(self.sample_rate)
if self.sample_rate != impulse_segment.sample_rate:
raise ValueError("Impulse segment's sample rate (%d Hz) is not"
raise ValueError("Impulse segment's sample rate (%d Hz) is not "
"equal to base signal sample rate (%d Hz)." %
(impulse_segment.sample_rate, self.sample_rate))
samples = signal.fftconvolve(self.samples, impulse_segment.samples,
Expand Down
55 changes: 43 additions & 12 deletions deep_speech_2/data_utils/augmentor/augmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor
from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor
from data_utils.augmentor.noise_perturb import NoisePerturbAugmentor
from data_utils.augmentor.impulse_response import ImpulseResponseAugmentor
from data_utils.augmentor.resample import ResampleAugmentor
from data_utils.augmentor.online_bayesian_normalization import \
OnlineBayesianNormalizationAugmentor
Expand All @@ -23,21 +25,46 @@ class AugmentationPipeline(object):
string, e.g.

.. code-block::

'[{"type": "volume",
"params": {"min_gain_dBFS": -15,
"max_gain_dBFS": 15},
"prob": 0.5},
{"type": "speed",
"params": {"min_speed_rate": 0.8,
"max_speed_rate": 1.2},
"prob": 0.5}
]'

[ {
"type": "noise",
"params": {"min_snr_dB": 10,
"max_snr_dB": 20,
"noise_manifest_path": "datasets/manifest.noise"},
"prob": 0.0
},
{
"type": "speed",
"params": {"min_speed_rate": 0.9,
"max_speed_rate": 1.1},
"prob": 1.0
},
{
"type": "shift",
"params": {"min_shift_ms": -5,
"max_shift_ms": 5},
"prob": 1.0
},
{
"type": "volume",
"params": {"min_gain_dBFS": -10,
"max_gain_dBFS": 10},
"prob": 0.0
},
{
"type": "bayesian_normal",
"params": {"target_db": -20,
"prior_db": -20,
"prior_samples": 100},
"prob": 0.0
}
]

This augmentation configuration inserts two augmentation models
into the pipeline, with one is VolumePerturbAugmentor and the other
SpeedPerturbAugmentor. "prob" indicates the probability of the current
augmentor to take effect.
augmentor to take effect. If "prob" is zero, the augmentor does not take
effect.

:param augmentation_config: Augmentation configuration in json string.
:type augmentation_config: str
Expand All @@ -60,7 +87,7 @@ def transform_audio(self, audio_segment):
:type audio_segment: AudioSegmenet|SpeechSegment
"""
for augmentor, rate in zip(self._augmentors, self._rates):
if self._rng.uniform(0., 1.) <= rate:
if self._rng.uniform(0., 1.) < rate:
augmentor.transform_audio(audio_segment)

def _parse_pipeline_from(self, config_json):
Expand Down Expand Up @@ -89,5 +116,9 @@ def _get_augmentor(self, augmentor_type, params):
return ResampleAugmentor(self._rng, **params)
elif augmentor_type == "bayesian_normal":
return OnlineBayesianNormalizationAugmentor(self._rng, **params)
elif augmentor_type == "noise":
return NoisePerturbAugmentor(self._rng, **params)
elif augmentor_type == "impulse":
return ImpulseResponseAugmentor(self._rng, **params)
else:
raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
35 changes: 35 additions & 0 deletions deep_speech_2/data_utils/augmentor/impulse_response.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""Contains the impulse response augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from data_utils.augmentor.base import AugmentorBase
from data_utils import utils
from data_utils.audio import AudioSegment


class ImpulseResponseAugmentor(AugmentorBase):
"""Augmentation model for adding impulse response effect.

:param rng: Random generator object.
:type rng: random.Random
:param impulse_manifest_path: Manifest path for impulse audio data.
:type impulse_manifest_path: basestring
"""

def __init__(self, rng, impulse_manifest_path):
self._rng = rng
self._impulse_manifest = utils.read_manifest(
manifest_path=impulse_manifest_path)

def transform_audio(self, audio_segment):
"""Add impulse response effect.

Note that this is an in-place transformation.

:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegmenet|SpeechSegment
"""
impulse_json = self._rng.sample(self._impulse_manifest, 1)[0]
impulse_segment = AudioSegment.from_file(impulse_json['audio_filepath'])
audio_segment.convolve(impulse_segment, allow_resample=True)
50 changes: 50 additions & 0 deletions deep_speech_2/data_utils/augmentor/noise_perturb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""Contains the noise perturb augmentation model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from data_utils.augmentor.base import AugmentorBase
from data_utils import utils
from data_utils.audio import AudioSegment


class NoisePerturbAugmentor(AugmentorBase):
"""Augmentation model for adding background noise.

:param rng: Random generator object.
:type rng: random.Random
:param min_snr_dB: Minimal signal noise ratio, in decibels.
:type min_snr_dB: float
:param max_snr_dB: Maximal signal noise ratio, in decibels.
:type max_snr_dB: float
:param noise_manifest_path: Manifest path for noise audio data.
:type noise_manifest_path: basestring
"""

def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path):
self._min_snr_dB = min_snr_dB
self._max_snr_dB = max_snr_dB
self._rng = rng
self._noise_manifest = utils.read_manifest(
manifest_path=noise_manifest_path)

def transform_audio(self, audio_segment):
"""Add background noise audio.

Note that this is an in-place transformation.

:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegmenet|SpeechSegment
"""
noise_json = self._rng.sample(self._noise_manifest, 1)[0]
if noise_json['duration'] < audio_segment.duration:
raise RuntimeError("The duration of sampled noise audio is smaller "
"than the audio segment to add effects to.")
diff_duration = noise_json['duration'] - audio_segment.duration
start = self._rng.uniform(0, diff_duration)
end = start + audio_segment.duration
noise_segment = AudioSegment.slice_from_file(
noise_json['audio_filepath'], start=start, end=end)
snr_dB = self._rng.uniform(self._min_snr_dB, self._max_snr_dB)
audio_segment.add_noise(
noise_segment, snr_dB, allow_downsampling=True, rng=self._rng)
Empty file.
Empty file modified deep_speech_2/data_utils/augmentor/resample.py
100755 → 100644
Empty file.
2 changes: 1 addition & 1 deletion deep_speech_2/data_utils/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def batch_reader():
manifest, batch_size, clipped=True)
elif shuffle_method == "instance_shuffle":
self._rng.shuffle(manifest)
elif not shuffle_method:
elif shuffle_method == None:
pass
else:
raise ValueError("Unknown shuffle method %s." %
Expand Down
2 changes: 1 addition & 1 deletion deep_speech_2/data_utils/speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def slice_from_file(cls, filepath, transcript, start=None, end=None):
speech file.
:rtype: SpeechSegment
"""
audio = Audiosegment.slice_from_file(filepath, start, end)
audio = AudioSegment.slice_from_file(filepath, start, end)
return cls(audio.samples, audio.sample_rate, transcript)

@classmethod
Expand Down
128 changes: 128 additions & 0 deletions deep_speech_2/datasets/noise/chime3_background.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
"""Prepare CHiME3 background data.

Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import distutils.util
import os
import wget
import zipfile
import argparse
import soundfile
import json
from paddle.v2.dataset.common import md5file

DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

URL = "https://d4s.myairbridge.com/packagev2/AG0Y3DNBE5IWRRTV/?dlid=W19XG7T0NNHB027139H0EQ"
MD5 = "c3ff512618d7a67d4f85566ea1bc39ec"

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/chime3_background",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_filepath",
default="manifest.chime3.background",
type=str,
help="Filepath for output manifests. (default: %(default)s)")
args = parser.parse_args()


def download(url, md5sum, target_dir, filename=None):
"""Download file from url to target_dir, and check md5sum."""
if filename == None:
filename = url.split("/")[-1]
if not os.path.exists(target_dir): os.makedirs(target_dir)
filepath = os.path.join(target_dir, filename)
if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
print("Downloading %s ..." % url)
wget.download(url, target_dir)
print("\nMD5 Chesksum %s ..." % filepath)
if not md5file(filepath) == md5sum:
raise RuntimeError("MD5 checksum failed.")
else:
print("File exists, skip downloading. (%s)" % filepath)
return filepath


def unpack(filepath, target_dir):
"""Unpack the file to the target_dir."""
print("Unpacking %s ..." % filepath)
if filepath.endswith('.zip'):
zip = zipfile.ZipFile(filepath, 'r')
zip.extractall(target_dir)
zip.close()
elif filepath.endswith('.tar') or filepath.endswith('.tar.gz'):
tar = zipfile.open(filepath)
tar.extractall(target_dir)
tar.close()
else:
raise ValueError("File format is not supported for unpacking.")


def create_manifest(data_dir, manifest_path):
"""Create a manifest json file summarizing the data set, with each line
containing the meta data (i.e. audio filepath, transcription text, audio
duration) of each audio file within the data set.
"""
print("Creating manifest %s ..." % manifest_path)
json_lines = []
for subfolder, _, filelist in sorted(os.walk(data_dir)):
for filename in filelist:
if filename.endswith('.wav'):
filepath = os.path.join(data_dir, subfolder, filename)
audio_data, samplerate = soundfile.read(filepath)
duration = float(len(audio_data)) / samplerate
json_lines.append(
json.dumps({
'audio_filepath': filepath,
'duration': duration,
'text': ''
}))
with open(manifest_path, 'w') as out_file:
for line in json_lines:
out_file.write(line + '\n')


def prepare_chime3(url, md5sum, target_dir, manifest_path):
"""Download, unpack and create summmary manifest file."""
if not os.path.exists(os.path.join(target_dir, "CHiME3")):
# download
filepath = download(url, md5sum, target_dir,
"myairbridge-AG0Y3DNBE5IWRRTV.zip")
# unpack
unpack(filepath, target_dir)
unpack(
os.path.join(target_dir, 'CHiME3_background_bus.zip'), target_dir)
unpack(
os.path.join(target_dir, 'CHiME3_background_caf.zip'), target_dir)
unpack(
os.path.join(target_dir, 'CHiME3_background_ped.zip'), target_dir)
unpack(
os.path.join(target_dir, 'CHiME3_background_str.zip'), target_dir)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
target_dir)
# create manifest json file
create_manifest(target_dir, manifest_path)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe we should do some checking here to make sure existence of audio files and transcription text files

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

不需要,第一,noise文件不存在transcription;第二,这里的逻辑是是遍历所有的音频文件(不存在就不会被遍历到)。



def main():
prepare_chime3(
url=URL,
md5sum=MD5,
target_dir=args.target_dir,
manifest_path=args.manifest_filepath)


if __name__ == '__main__':
main()
Loading