Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.idea/
*.pyc
__pycache__/
.ipynb_checkpoints/
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,10 @@ epoch 360, loss_tr=0.033095 err_tr=0.009600 loss_te=4.254683 err_te=0.419954 err
The convergence is initially very fast (see the first 30 epochs). After that the performance improvement decreases and oscillations in the sentence error rate appear. Despite these oscillations an average improvement trend can be observed for the subsequent epochs. In this experiment, we stopped our training at epoch 360.
The fields of the res.res file have the following meaning:
- loss_tr: is the average training loss (i.e., cross-entropy function) computed at every frame.
- err_tr: is the classification error (measured at frame level) of the training data. Note that we split the speech signals into chunks of 200ms with 10ms overlap. The error is averaged for all the chunks of the training dataset.
- err_tr: is the classification error (measured at frame level) of the training data. Note that we split the speech signals into chunks of 200ms with 190ms overlap. The error is averaged for all the chunks of the training dataset.
- loss_te is the average test loss (i.e., cross-entropy function) computed at every frame.
- err_te: is the classification error (measured at frame level) of the test data.
- err_te_snt: is the classification error (measured at sentence level) of the test data. Note that we split the speech signal into chunks of 200ms with 10ms overlap. For each chunk, our SincNet performs a prediction over the set of speakers. To compute this classification error rate we averaged the predictions and, for each sentence, we voted for the speaker with the highest average probability.
- err_te_snt: is the classification error (measured at sentence level) of the test data. Note that we split the speech signal into chunks of 200ms with 190ms overlap. For each chunk, our SincNet performs a prediction over the set of speakers. To compute this classification error rate we averaged the predictions and, for each sentence, we voted for the speaker with the highest average probability.

[You can find our trained model for TIMIT here.](https://bitbucket.org/mravanelli/sincnet_models/)

Expand Down
99 changes: 49 additions & 50 deletions TIMIT_preparation.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
#!/usr/bin/env python3

# TIMIT_preparation
# Mirco Ravanelli
# Mila - University of Montreal
# TIMIT_preparation
# Mirco Ravanelli
# Mila - University of Montreal

# July 2018

# Description:
# This code prepares TIMIT for the following speaker identification experiments.
# Description:
# This code prepares TIMIT for the following speaker identification experiments.
# It removes start and end silences according to the information reported in the *.wrd files and normalizes the amplitude of each sentence.

# How to run it:
# python TIMIT_preparation.py $TIMIT_FOLDER $OUTPUT_FOLDER data_lists/TIMIT_all.scp
# python TIMIT_preparation.py $TIMIT_FOLDER $OUTPUT_FOLDER data_lists/TIMIT_all.scp

# NOTE: This script expects filenames in lowercase (e.g., "train/dr1/fcjf0/si1027.wav" rather than "TRAIN/DR1/FCJF0/SI1027.WAV")

Expand All @@ -22,59 +22,58 @@
import numpy as np
import sys


def ReadList(list_file):
    """Read a text list file and return its lines, one entry per element.

    Args:
        list_file: path to a text file (e.g. an .scp list or a TIMIT .wrd
            alignment file) with one record per line.

    Returns:
        list[str]: each line with trailing whitespace/newline stripped,
        in file order.
    """
    # 'with' guarantees the handle is closed even if reading raises,
    # unlike the previous open()/close() pair.
    with open(list_file, "r") as f:
        return [line.rstrip() for line in f]

def ig_f(dir, files):
    """Ignore-callback for shutil.copytree: report every plain file as
    "to be ignored" so that only the directory tree is replicated.

    Args:
        dir: directory being visited by copytree.
        files: names contained in that directory.

    Returns:
        list[str]: the names in `files` that are regular files (these
        are skipped by copytree; sub-directories are kept).
    """
    return [f for f in files if os.path.isfile(os.path.join(dir, f))]


def copy_folder(in_folder, out_folder):
    """Replicate the directory structure of in_folder under out_folder.

    Only directories are created — no files are copied (see ig_f).
    If out_folder already exists, nothing is done, so re-running the
    preparation script is safe.

    Args:
        in_folder: existing source tree.
        out_folder: destination root to create.
    """
    if not os.path.isdir(out_folder):
        shutil.copytree(in_folder, out_folder, ignore=ig_f)


# ----- Command-line arguments -----
in_folder = sys.argv[1]   # root of the original TIMIT corpus
out_folder = sys.argv[2]  # destination root for the processed corpus
list_file = sys.argv[3]   # .scp file listing the wav files to process

# Read the list of sentences to process (relative paths, lowercase)
list_sig = ReadList(list_file)

# Replicate input folder structure to output folder (directories only)
copy_folder(in_folder, out_folder)

# Speech normalization loop: for each sentence, peak-normalize the
# amplitude and trim leading/trailing silences using the .wrd alignment.
for sig_path in list_sig:
    # Open the wav file
    wav_file = in_folder + '/' + sig_path
    [signal, fs] = sf.read(wav_file)
    signal = signal.astype(np.float64)

    # Amplitude normalization: scale so the absolute peak is 1.0
    signal = signal / np.max(np.abs(signal))

    # Read the word-alignment (.wrd) file to locate the speech region
    wrd_file = wav_file.replace(".wav", ".wrd")
    wrd_sig = ReadList(wrd_file)
    beg_sig = int(wrd_sig[0].split(' ')[0])   # start sample of first word
    end_sig = int(wrd_sig[-1].split(' ')[1])  # end sample of last word

    # Remove start/end silences
    signal = signal[beg_sig:end_sig]

    # Save normalized, trimmed speech mirroring the input layout
    file_out = out_folder + '/' + sig_path
    sf.write(file_out, signal, fs)

    print("Done %s" % (file_out))
Empty file added __init__.py
Empty file.
Loading