Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.idea/
*.pyc
__pycache__/
.ipynb_checkpoints/
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,10 @@ epoch 360, loss_tr=0.033095 err_tr=0.009600 loss_te=4.254683 err_te=0.419954 err
The convergence is initially very fast (see the first 30 epochs). After that the performance improvement decreases and oscillations in the sentence error rate appear. Despite these oscillations an average improvement trend can be observed for the subsequent epochs. In this experiment, we stopped our training at epoch 360.
The fields of the res.res file have the following meaning:
- loss_tr: is the average training loss (i.e., cross-entropy function) computed at every frame.
- err_tr: is the classification error (measured at frame level) of the training data. Note that we split the speech signals into chunks of 200ms with 10ms overlap. The error is averaged for all the chunks of the training dataset.
- err_tr: is the classification error (measured at frame level) of the training data. Note that we split the speech signals into chunks of 200ms with 190ms overlap. The error is averaged for all the chunks of the training dataset.
- loss_te is the average test loss (i.e., cross-entropy function) computed at every frame.
- err_te: is the classification error (measured at frame level) of the test data.
- err_te_snt: is the classification error (measured at sentence level) of the test data. Note that we split the speech signal into chunks of 200ms with 10ms overlap. For each chunk, our SincNet performs a prediction over the set of speakers. To compute this classification error rate we averaged the predictions and, for each sentence, we voted for the speaker with the highest average probability.
- err_te_snt: is the classification error (measured at sentence level) of the test data. Note that we split the speech signal into chunks of 200ms with 190ms overlap. For each chunk, our SincNet performs a prediction over the set of speakers. To compute this classification error rate we averaged the predictions and, for each sentence, we voted for the speaker with the highest average probability.

[You can find our trained model for TIMIT here.](https://bitbucket.org/mravanelli/sincnet_models/)

Expand Down
99 changes: 49 additions & 50 deletions TIMIT_preparation.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
#!/usr/bin/env python3

# TIMIT_preparation
# Mirco Ravanelli
# Mila - University of Montreal
# TIMIT_preparation
# Mirco Ravanelli
# Mila - University of Montreal

# July 2018

# Description:
# This code prepares TIMIT for the following speaker identification experiments.
# Description:
# This code prepares TIMIT for the following speaker identification experiments.
# It removes start and end silences according to the information reported in the *.wrd files and normalizes the amplitude of each sentence.

# How to run it:
# python TIMIT_preparation.py $TIMIT_FOLDER $OUTPUT_FOLDER data_lists/TIMIT_all.scp
# python TIMIT_preparation.py $TIMIT_FOLDER $OUTPUT_FOLDER data_lists/TIMIT_all.scp

# NOTE: This script expects filenames in lowercase (e.g., "train/dr1/fcjf0/si1027.wav" rather than "TRAIN/DR1/FCJF0/SI1027.WAV")

Expand All @@ -22,59 +22,58 @@
import numpy as np
import sys


def ReadList(list_file):
    """Read a text list file and return its lines, one entry per element.

    Args:
        list_file: path to a text file (e.g. an .scp list or a TIMIT .wrd
            alignment file) with one record per line.

    Returns:
        list[str]: each line with trailing whitespace/newline stripped,
        in file order.
    """
    # 'with' guarantees the handle is closed even if reading raises,
    # unlike the previous open()/close() pair.
    with open(list_file, "r") as f:
        return [line.rstrip() for line in f]

def ig_f(dir, files):
    """Ignore-callback for shutil.copytree: report every plain file as
    "to be ignored" so that only the directory tree is replicated.

    Args:
        dir: directory being visited by copytree.
        files: names contained in that directory.

    Returns:
        list[str]: the names in `files` that are regular files (these
        are skipped by copytree; sub-directories are kept).
    """
    return [f for f in files if os.path.isfile(os.path.join(dir, f))]


def copy_folder(in_folder, out_folder):
    """Replicate the directory structure of in_folder under out_folder.

    Only directories are created — no files are copied (see ig_f).
    If out_folder already exists, nothing is done, so re-running the
    preparation script is safe.

    Args:
        in_folder: existing source tree.
        out_folder: destination root to create.
    """
    if not os.path.isdir(out_folder):
        shutil.copytree(in_folder, out_folder, ignore=ig_f)


# ----- Command-line arguments -----
in_folder = sys.argv[1]   # root of the original TIMIT corpus
out_folder = sys.argv[2]  # destination root for the processed corpus
list_file = sys.argv[3]   # .scp file listing the wav files to process

# Read the list of sentences to process (relative paths, lowercase)
list_sig = ReadList(list_file)

# Replicate input folder structure to output folder (directories only)
copy_folder(in_folder, out_folder)

# Speech normalization loop: for each sentence, peak-normalize the
# amplitude and trim leading/trailing silences using the .wrd alignment.
for sig_path in list_sig:
    # Open the wav file
    wav_file = in_folder + '/' + sig_path
    [signal, fs] = sf.read(wav_file)
    signal = signal.astype(np.float64)

    # Amplitude normalization: scale so the absolute peak is 1.0
    signal = signal / np.max(np.abs(signal))

    # Read the word-alignment (.wrd) file to locate the speech region
    wrd_file = wav_file.replace(".wav", ".wrd")
    wrd_sig = ReadList(wrd_file)
    beg_sig = int(wrd_sig[0].split(' ')[0])   # start sample of first word
    end_sig = int(wrd_sig[-1].split(' ')[1])  # end sample of last word

    # Remove start/end silences
    signal = signal[beg_sig:end_sig]

    # Save normalized, trimmed speech mirroring the input layout
    file_out = out_folder + '/' + sig_path
    sf.write(file_out, signal, fs)

    print("Done %s" % (file_out))
Empty file added __init__.py
Empty file.
Loading