Commit

Init

egorsmkv committed May 20, 2022
0 parents commit d1bd55f
Showing 8 changed files with 357 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -0,0 +1,8 @@
__pycache__/
.idea/
.hypothesis/
.pytest_cache/
data/
speech/
.vscode/

21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 Yehor Smoliakov

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
31 changes: 31 additions & 0 deletions README.md
@@ -0,0 +1,31 @@
# ASR Corpus by Microphone

## Overview

This repository contains a script that records speech from your microphone and splits it into utterance-sized WAV files using the Silero voice activity detector.

Watch the video below to see how it works:

## Installation

Install the Python requirements (the `wave` module ships with the Python standard library, so it does not need to be installed separately):

```
pip install torch torchaudio pyaudio
```
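On Linux, `pyaudio` builds against the PortAudio library, so you may need a system package first; for example, on Debian/Ubuntu (an assumption about your distribution):

```
sudo apt install portaudio19-dev
```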

## Running

```
# Create the working folders
mkdir data speech

# Start recording and splitting
python record_and_split.py
```
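Each 20-second recording is written to `data/`, split into per-utterance WAV files under `speech/`, and the raw recording is then deleted. A terminal session looks roughly like this (the timestamps in the filenames will differ):

```
* recording
* done recording
File speech/recording_1653042000.123_0.wav saved
File speech/recording_1653042000.123_1.wav saved
Done, going to the next recording...
```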

## Acknowledgements

- Silero VAD: https://github.com/snakers4/silero-vad
- PyAudio: https://people.csail.mit.edu/hubert/pyaudio/
- wave: https://pythonhosted.org/Wave/
9 changes: 9 additions & 0 deletions licenses/PyAudio-LICENSE
@@ -0,0 +1,9 @@
PyAudio is distributed under the MIT License:
Copyright (c) 2006 Hubert Pham

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

21 changes: 21 additions & 0 deletions licenses/Silero-LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020-present Silero Team

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
80 changes: 80 additions & 0 deletions record_and_split.py
@@ -0,0 +1,80 @@
import torch
import time
import pyaudio
import wave
import os

from utils_vad import get_speech_timestamps, read_audio, init_jit_model, save_audio

# Fix the number of threads (as it is shown in Silero demos)
torch.set_num_threads(1)

# Init the model
model = init_jit_model('./silero_vad.jit')
window_size_samples = 512

# Config for the microphone
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
RECORD_SECONDS = 20


def split_speech(filename):
    wav = read_audio(filename, sampling_rate=16000)
    speech_timestamps = get_speech_timestamps(wav,
                                              model,
                                              sampling_rate=16000,
                                              window_size_samples=window_size_samples)
    for idx, ts in enumerate(speech_timestamps):
        segment = wav[ts['start']:ts['end']]
        new_filename = filename.replace('data/', '').replace('.wav', f'_{idx}.wav')
        save_dir = f'speech/{new_filename}'

        save_audio(save_dir, segment)

        print(f'File {save_dir} saved')

    os.remove(filename)


def run():
    while True:
        output_filename = f"data/recording_{time.time()}.wav"

        p = pyaudio.PyAudio()
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)

        # Record audio from the microphone
        print("* recording")
        frames = []
        for _ in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
            data = stream.read(CHUNK)
            frames.append(data)
        print("* done recording")

        stream.stop_stream()
        stream.close()
        p.terminate()

        # Save the recording into a WAV file
        wf = wave.open(output_filename, 'wb')
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
        wf.close()

        # Split the recording by VAD
        split_speech(output_filename)

        print('Done, going to the next recording...')


if __name__ == '__main__':
    run()
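Note that the recorder captures fixed 20-second windows, so an utterance that straddles a window boundary will be split across two recordings; if that matters for your corpus, pause briefly between sentences while recording.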
Binary file added silero_vad.jit
Binary file not shown.
187 changes: 187 additions & 0 deletions utils_vad.py
@@ -0,0 +1,187 @@
import torch
import torchaudio
from typing import List
import torch.nn.functional as F
import warnings


def read_audio(path: str, sampling_rate: int = 16000):
    wav, sr = torchaudio.load(path)

    if wav.size(0) > 1:
        wav = wav.mean(dim=0, keepdim=True)

    if sr != sampling_rate:
        transform = torchaudio.transforms.Resample(orig_freq=sr,
                                                   new_freq=sampling_rate)
        wav = transform(wav)
        sr = sampling_rate

    assert sr == sampling_rate
    return wav.squeeze(0)


def save_audio(path: str,
               tensor: torch.Tensor,
               sampling_rate: int = 16000):
    torchaudio.save(path, tensor.unsqueeze(0), sampling_rate)


def init_jit_model(model_path: str,
                   device=torch.device('cpu')):
    torch.set_grad_enabled(False)
    model = torch.jit.load(model_path, map_location=device)
    model.eval()
    return model


def get_speech_timestamps(audio: torch.Tensor,
                          model,
                          threshold: float = 0.5,
                          sampling_rate: int = 16000,
                          min_speech_duration_ms: int = 250,
                          min_silence_duration_ms: int = 100,
                          window_size_samples: int = 1536,
                          speech_pad_ms: int = 30,
                          return_seconds: bool = False):
    """
    This method is used for splitting long audio into speech chunks using the Silero VAD.

    Parameters
    ----------
    audio: torch.Tensor, one dimensional
        One-dimensional float torch.Tensor; other types are cast to torch if possible
    model: preloaded .jit Silero VAD model
    threshold: float (default - 0.5)
        Speech threshold. Silero VAD outputs speech probabilities for each audio chunk; probabilities ABOVE this value are considered SPEECH.
        It is better to tune this parameter for each dataset separately, but the "lazy" 0.5 is pretty good for most datasets.
    sampling_rate: int (default - 16000)
        Currently Silero VAD models support 8000 and 16000 sample rates
    min_speech_duration_ms: int (default - 250 milliseconds)
        Final speech chunks shorter than min_speech_duration_ms are thrown out
    min_silence_duration_ms: int (default - 100 milliseconds)
        At the end of each speech chunk, wait for min_silence_duration_ms before separating it
    window_size_samples: int (default - 1536 samples)
        Audio chunks of window_size_samples size are fed to the Silero VAD model.
        WARNING! Silero VAD models were trained using 512, 1024, 1536 samples for the 16000 sample rate and 256, 512, 768 samples for the 8000 sample rate.
        Values other than these may affect model performance!
    speech_pad_ms: int (default - 30 milliseconds)
        Final speech chunks are padded by speech_pad_ms on each side
    return_seconds: bool (default - False)
        Whether to return timestamps in seconds (default - samples)

    Returns
    ----------
    speeches: list of dicts
        List containing the beginnings and ends of speech chunks (samples or seconds, based on return_seconds)
    """

    if not torch.is_tensor(audio):
        try:
            audio = torch.Tensor(audio)
        except Exception:
            raise TypeError("Audio cannot be cast to tensor. Cast it manually")

    if len(audio.shape) > 1:
        for i in range(len(audio.shape)):  # trying to squeeze empty dimensions
            audio = audio.squeeze(0)
        if len(audio.shape) > 1:
            raise ValueError("More than one dimension in audio. Are you trying to process audio with 2 channels?")

    if sampling_rate > 16000 and (sampling_rate % 16000 == 0):
        step = sampling_rate // 16000
        sampling_rate = 16000
        audio = audio[::step]
        warnings.warn('Sampling rate is a multiple of 16000, casting to 16000 manually!')
    else:
        step = 1

    if sampling_rate == 8000 and window_size_samples > 768:
        warnings.warn(
            'window_size_samples is too big for the 8000 sampling_rate! Better set window_size_samples to 256, 512 or 768 for the 8000 sample rate!')
    if window_size_samples not in [256, 512, 768, 1024, 1536]:
        warnings.warn(
            'Unusual window_size_samples! Supported window_size_samples:\n - [512, 1024, 1536] for 16000 sampling_rate\n - [256, 512, 768] for 8000 sampling_rate')

    model.reset_states()
    min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
    min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
    speech_pad_samples = sampling_rate * speech_pad_ms / 1000

    audio_length_samples = len(audio)

    # Run the model over the audio window by window, collecting a speech
    # probability for each window
    speech_probs = []
    for current_start_sample in range(0, audio_length_samples, window_size_samples):
        chunk = audio[current_start_sample: current_start_sample + window_size_samples]
        if len(chunk) < window_size_samples:
            chunk = torch.nn.functional.pad(chunk, (0, int(window_size_samples - len(chunk))))
        speech_prob = model(chunk, sampling_rate).item()
        speech_probs.append(speech_prob)

    # Turn the per-window probabilities into speech segments using hysteresis:
    # a segment starts when the probability rises above `threshold` and ends
    # only after it stays below `neg_threshold` for at least min_silence_samples
    triggered = False
    speeches = []
    current_speech = {}
    neg_threshold = threshold - 0.15
    temp_end = 0

    for i, speech_prob in enumerate(speech_probs):
        if (speech_prob >= threshold) and temp_end:
            temp_end = 0

        if (speech_prob >= threshold) and not triggered:
            triggered = True
            current_speech['start'] = window_size_samples * i
            continue

        if (speech_prob < neg_threshold) and triggered:
            if not temp_end:
                temp_end = window_size_samples * i
            if (window_size_samples * i) - temp_end < min_silence_samples:
                continue
            else:
                current_speech['end'] = temp_end
                if (current_speech['end'] - current_speech['start']) > min_speech_samples:
                    speeches.append(current_speech)
                temp_end = 0
                current_speech = {}
                triggered = False
                continue

    if current_speech and (audio_length_samples - current_speech['start']) > min_speech_samples:
        current_speech['end'] = audio_length_samples
        speeches.append(current_speech)

    # Pad each segment by speech_pad_samples, sharing the silence between
    # neighbouring segments when they are closer than 2 * speech_pad_samples
    for i, speech in enumerate(speeches):
        if i == 0:
            speech['start'] = int(max(0, speech['start'] - speech_pad_samples))
        if i != len(speeches) - 1:
            silence_duration = speeches[i + 1]['start'] - speech['end']
            if silence_duration < 2 * speech_pad_samples:
                speech['end'] += int(silence_duration // 2)
                speeches[i + 1]['start'] = int(max(0, speeches[i + 1]['start'] - silence_duration // 2))
            else:
                speech['end'] += int(speech_pad_samples)
        else:
            speech['end'] = int(min(audio_length_samples, speech['end'] + speech_pad_samples))

    if return_seconds:
        for speech_dict in speeches:
            speech_dict['start'] = round(speech_dict['start'] / sampling_rate, 1)
            speech_dict['end'] = round(speech_dict['end'] / sampling_rate, 1)
    elif step > 1:
        # The audio was decimated above, so map the timestamps back
        # to the original sampling rate
        for speech_dict in speeches:
            speech_dict['start'] *= step
            speech_dict['end'] *= step

    return speeches
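As a minimal sketch of how these helpers fit together outside the recording loop (assuming `silero_vad.jit` is in the working directory; `audio.wav` is a hypothetical input file):

```
from utils_vad import init_jit_model, read_audio, get_speech_timestamps

model = init_jit_model('./silero_vad.jit')
wav = read_audio('audio.wav', sampling_rate=16000)

# Return timestamps in seconds rather than samples
timestamps = get_speech_timestamps(wav, model,
                                   sampling_rate=16000,
                                   window_size_samples=512,
                                   return_seconds=True)
print(timestamps)  # e.g. [{'start': 0.5, 'end': 2.1}, ...]
```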
