diff --git a/README.md b/README.md
index 454fb79c7..aae9b6ac0 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,8 @@
 # Real-Time Voice Cloning
 This repository is an implementation of [Transfer Learning from Speaker Verification to
-Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. Feel free to check [my thesis](https://matheo.uliege.be/handle/2268.2/6801) if you're curious or if you're looking for info I haven't documented. Mostly I would recommend giving a quick look to the figures beyond the introduction.
+Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. This was my [master's thesis](https://matheo.uliege.be/handle/2268.2/6801).
 
-SV2TTS is a three-stage deep learning framework that allows to create a numerical representation of a voice from a few seconds of audio, and to use it to condition a text-to-speech model trained to generalize to new voices.
+SV2TTS is a deep learning framework in three stages. In the first stage, one creates a digital representation of a voice from a few seconds of audio. In the second and third stages, this representation is used as reference to generate speech given arbitrary text.
 
 **Video demonstration** (click the picture):
 
@@ -19,31 +19,28 @@ SV2TTS is a three-stage deep learning framework that allows to create a numerica
 |[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo |
 
 ## News
+**28/12/21**: I've done a [major maintenance update](https://github.com/CorentinJ/Real-Time-Voice-Cloning/pull/961). Mostly, I've worked on making setup easier. Find new instructions in the section below.
+
 **14/02/21**: This repo now runs on PyTorch instead of Tensorflow, thanks to the help of @bluefish. If you wish to run the tensorflow version instead, checkout commit `5425557`.
 
 **13/11/19**: I'm now working full time and I will not maintain this repo anymore. To anyone who reads this:
 - **If you just want to clone your voice (and not someone else's):** I recommend our free plan on [Resemble.AI](https://www.resemble.ai/). You will get a better voice quality and less prosody errors.
 - **If this is not your case:** proceed with this repository, but you might end up being disappointed by the results. If you're planning to work on a serious project, my strong advice: find another TTS repo. Go [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/364) for more info.
 
-**20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder. You can use your trained encoder models from this repo with it.
-
-**06/07/19:** Need to run within a docker container on a remote server? See [here](https://sean.lane.sh/posts/2019/07/Running-the-Real-Time-Voice-Cloning-project-in-Docker/).
-
-**25/06/19:** Experimental support for low-memory GPUs (~2gb) added for the synthesizer. Pass `--low_mem` to `demo_cli.py` or `demo_toolbox.py` to enable it. It adds a big overhead, so it's not recommended if you have enough VRAM.
+**20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder (inference only). You can use your trained encoder models from this repo with it.
 
 ## Setup
 
-### 1. Install Requirements
-
-**Python 3.6 or 3.7** is needed to run the toolbox.
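For orientation while reading the rest of this diff: the three stages described above map onto the `encoder`, `synthesizer` and `vocoder` packages changed below. The following is a minimal sketch of how they chain together at inference time, using only calls that `demo_cli.py` exercises further down; the model paths are the new `saved_models/default/` defaults introduced by this PR, `reference.wav` is a placeholder clip, and the pretrained weights are assumed to be present (normally fetched by `ensure_default_models`).

```python
from pathlib import Path

import numpy as np
import soundfile as sf

from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder

# Stage 1: the speaker encoder turns a few seconds of reference audio into an embedding
encoder.load_model(Path("saved_models/default/encoder.pt"))
wav = encoder.preprocess_wav(Path("reference.wav"))  # placeholder clip of the target voice
embed = encoder.embed_utterance(wav)

# Stage 2: the synthesizer turns text plus that embedding into a mel spectrogram
synthesizer = Synthesizer(Path("saved_models/default/synthesizer.pt"))
specs = synthesizer.synthesize_spectrograms(["Hello, this is a cloned voice."], [embed])

# Stage 3: the vocoder turns the mel spectrogram into a waveform
vocoder.load_model(Path("saved_models/default/vocoder.pt"))
generated_wav = vocoder.infer_waveform(specs[0])

sf.write("cloned.wav", generated_wav.astype(np.float32), synthesizer.sample_rate)
```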
- -* Install [PyTorch](https://pytorch.org/get-started/locally/) (>=1.1.0). -* Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). -* Run `pip install -r requirements.txt` to install the remaining necessary packages. +#### 1. Install Requirements +1. Both Windows and Linux and supported. A GPU is recommended for training and for inference speed, but is not mandatory. +2. Python 3.7 or greater is recommended. Python 3.6 should work, python 3.5 might need some tweaking. I recommend setting up a virtual environment using `venv`, but this is optional. +3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files. +4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command. +5. Install the remaining requirements with `pip install -r requirements.txt` -### 2. Download Pretrained Models -Download the latest [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models). +### 2. (Optional) Download Pretrained Models +Pretrained models are now downloaded automatically. If this doesn't work for you, you can manually download them [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models). ### 3. (Optional) Test Configuration Before you download any dataset, you can begin by testing your configuration with: diff --git a/demo_cli.py b/demo_cli.py index d43f04d72..c6b3a0f2b 100644 --- a/demo_cli.py +++ b/demo_cli.py @@ -1,32 +1,33 @@ -from encoder.params_model import model_embedding_size as speaker_embedding_size -from utils.argutils import print_args -from utils.modelutils import check_model_paths -from synthesizer.inference import Synthesizer -from encoder import inference as encoder -from vocoder import inference as vocoder +import argparse +import os from pathlib import Path + +import librosa import numpy as np import soundfile as sf -import librosa -import argparse import torch -import sys -import os from audioread.exceptions import NoBackendError +from encoder import inference as encoder +from encoder.params_model import model_embedding_size as speaker_embedding_size +from synthesizer.inference import Synthesizer +from utils.argutils import print_args +from utils.default_models import ensure_default_models +from vocoder import inference as vocoder + + if __name__ == '__main__': - ## Info & args parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - parser.add_argument("-e", "--enc_model_fpath", type=Path, - default="encoder/saved_models/pretrained.pt", + parser.add_argument("-e", "--enc_model_fpath", type=Path, + default="saved_models/default/encoder.pt", help="Path to a saved encoder") - parser.add_argument("-s", "--syn_model_fpath", type=Path, - default="synthesizer/saved_models/pretrained/pretrained.pt", + parser.add_argument("-s", "--syn_model_fpath", type=Path, + default="saved_models/default/synthesizer.pt", help="Path to a saved synthesizer") - parser.add_argument("-v", "--voc_model_fpath", type=Path, - default="vocoder/saved_models/pretrained/pretrained.pt", + parser.add_argument("-v", "--voc_model_fpath", type=Path, + default="saved_models/default/vocoder.pt", help="Path to a saved vocoder") parser.add_argument("--cpu", action="store_true", help=\ "If True, processing is done on CPU, even when a GPU is available.") @@ -34,33 +35,22 @@ "If 
True, audio won't be played.") parser.add_argument("--seed", type=int, default=None, help=\ "Optional random number seed value to make toolbox deterministic.") - parser.add_argument("--no_mp3_support", action="store_true", help=\ - "If True, disallows loading mp3 files to prevent audioread errors when ffmpeg is not installed.") args = parser.parse_args() + arg_dict = vars(args) print_args(args, parser) - if not args.no_sound: - import sounddevice as sd - if args.cpu: - # Hide GPUs from Pytorch to force CPU processing + # Hide GPUs from Pytorch to force CPU processing + if arg_dict.pop("cpu"): os.environ["CUDA_VISIBLE_DEVICES"] = "-1" - if not args.no_mp3_support: - try: - librosa.load("samples/1320_00000.mp3") - except NoBackendError: - print("Librosa will be unable to open mp3 files if additional software is not installed.\n" - "Please install ffmpeg or add the '--no_mp3_support' option to proceed without support for mp3 files.") - exit(-1) - print("Running a test of your configuration...\n") - + if torch.cuda.is_available(): device_id = torch.cuda.current_device() gpu_properties = torch.cuda.get_device_properties(device_id) ## Print some environment information (for debugging purposes) print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with " - "%.1fGb total memory.\n" % + "%.1fGb total memory.\n" % (torch.cuda.device_count(), device_id, gpu_properties.name, @@ -69,68 +59,64 @@ gpu_properties.total_memory / 1e9)) else: print("Using CPU for inference.\n") - - ## Remind the user to download pretrained models if needed - check_model_paths(encoder_path=args.enc_model_fpath, - synthesizer_path=args.syn_model_fpath, - vocoder_path=args.voc_model_fpath) - + ## Load the models one by one. print("Preparing the encoder, the synthesizer and the vocoder...") + ensure_default_models(Path("saved_models")) encoder.load_model(args.enc_model_fpath) synthesizer = Synthesizer(args.syn_model_fpath) vocoder.load_model(args.voc_model_fpath) - - + + ## Run a test print("Testing your configuration with small inputs.") # Forward an audio waveform of zeroes that lasts 1 second. Notice how we can get the encoder's # sampling rate, which may differ. - # If you're unfamiliar with digital audio, know that it is encoded as an array of floats + # If you're unfamiliar with digital audio, know that it is encoded as an array of floats # (or sometimes integers, but mostly floats in this projects) ranging from -1 to 1. # The sampling rate is the number of values (samples) recorded per second, it is set to - # 16000 for the encoder. Creating an array of length will always correspond + # 16000 for the encoder. Creating an array of length will always correspond # to an audio of 1 second. print("\tTesting the encoder...") encoder.embed_utterance(np.zeros(encoder.sampling_rate)) - + # Create a dummy embedding. You would normally use the embedding that encoder.embed_utterance # returns, but here we're going to make one ourselves just for the sake of showing that it's # possible. embed = np.random.rand(speaker_embedding_size) - # Embeddings are L2-normalized (this isn't important here, but if you want to make your own + # Embeddings are L2-normalized (this isn't important here, but if you want to make your own # embeddings it will be). embed /= np.linalg.norm(embed) - # The synthesizer can handle multiple inputs with batching. Let's create another embedding to + # The synthesizer can handle multiple inputs with batching. 
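To make the digital-audio note above concrete, the relationship between sample count, sampling rate and duration can be checked in a few lines. The 16000 Hz value mirrors the encoder's documented rate; the tone frequency and amplitude are arbitrary.

```python
import numpy as np

sampling_rate = 16000  # the speaker encoder's rate, per the comment above
duration_s = 1.0
t = np.arange(int(sampling_rate * duration_s)) / sampling_rate

# One second of a 440 Hz tone: floats in [-1, 1], and the length alone encodes the duration
wav = 0.1 * np.sin(2 * np.pi * 440 * t).astype(np.float32)
print(len(wav), "samples =", len(wav) / sampling_rate, "second(s)")
```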
Let's create another embedding to # illustrate that embeds = [embed, np.zeros(speaker_embedding_size)] texts = ["test 1", "test 2"] print("\tTesting the synthesizer... (loading the model will output a lot of text)") mels = synthesizer.synthesize_spectrograms(texts, embeds) - - # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We + + # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We # can concatenate the mel spectrograms to a single one. mel = np.concatenate(mels, axis=1) - # The vocoder can take a callback function to display the generation. More on that later. For + # The vocoder can take a callback function to display the generation. More on that later. For # now we'll simply hide it like this: no_action = lambda *args: None print("\tTesting the vocoder...") - # For the sake of making this test short, we'll pass a short target length. The target length - # is the length of the wav segments that are processed in parallel. E.g. for audio sampled + # For the sake of making this test short, we'll pass a short target length. The target length + # is the length of the wav segments that are processed in parallel. E.g. for audio sampled # at 16000 Hertz, a target length of 8000 means that the target audio will be cut in chunks of - # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and - # that has a detrimental effect on the quality of the audio. The default parameters are + # 0.5 seconds which will all be generated together. The parameters here are absurdly short, and + # that has a detrimental effect on the quality of the audio. The default parameters are # recommended in general. vocoder.infer_waveform(mel, target=200, overlap=50, progress_callback=no_action) - + print("All test passed! You can now synthesize speech.\n\n") - - + + ## Interactive speech generation print("This is a GUI-less example of interface to SV2TTS. The purpose of this script is to " "show how you can interface this project easily with your own. See the source code for " "an explanation of what is happening.\n") - + print("Interactive generation loop") num_generated = 0 while True: @@ -140,13 +126,10 @@ "wav, m4a, flac, ...):\n" in_fpath = Path(input(message).replace("\"", "").replace("\'", "")) - if in_fpath.suffix.lower() == ".mp3" and args.no_mp3_support: - print("Can't Use mp3 files please try again:") - continue ## Computing the embedding - # First, we load the wav using the function that the speaker encoder provides. This is + # First, we load the wav using the function that the speaker encoder provides. This is # important: there is preprocessing that must be applied. - + # The following two methods are equivalent: # - Directly load from the filepath: preprocessed_wav = encoder.preprocess_wav(in_fpath) @@ -154,17 +137,17 @@ original_wav, sampling_rate = librosa.load(str(in_fpath)) preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate) print("Loaded file succesfully") - - # Then we derive the embedding. There are many functions and parameters that the + + # Then we derive the embedding. There are many functions and parameters that the # speaker encoder interfaces. These are mostly for in-depth research. 
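The "two equivalent methods" noted above, together with the fact that utterance embeddings are L2-normalized, make speaker comparison a one-liner: the similarity of two embeddings is simply their dot product. A sketch with placeholder audio paths, assuming the default pretrained encoder is available:

```python
from pathlib import Path

import librosa
import numpy as np

from encoder import inference as encoder

encoder.load_model(Path("saved_models/default/encoder.pt"))

# Method 1: let the speaker encoder load and preprocess the file itself
wav_a = encoder.preprocess_wav(Path("speaker_a.wav"))  # placeholder path

# Method 2: load the audio yourself, then hand it over along with its sampling rate
raw_wav, sampling_rate = librosa.load("speaker_b.wav")  # placeholder path
wav_b = encoder.preprocess_wav(raw_wav, sampling_rate)

embed_a = encoder.embed_utterance(wav_a)
embed_b = encoder.embed_utterance(wav_b)

# Embeddings are L2-normalized, so the dot product is the cosine similarity
print("speaker similarity: %.3f" % np.dot(embed_a, embed_b))
```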
You will typically # only use this function (with its default parameters): embed = encoder.embed_utterance(preprocessed_wav) print("Created the embedding") - - + + ## Generating the spectrogram text = input("Write a sentence (+-20 words) to be synthesized:\n") - + # If seed is specified, reset torch seed and force synthesizer reload if args.seed is not None: torch.manual_seed(args.seed) @@ -178,8 +161,8 @@ specs = synthesizer.synthesize_spectrograms(texts, embeds) spec = specs[0] print("Created the mel spectrogram") - - + + ## Generating the waveform print("Synthesizing the waveform:") @@ -191,8 +174,8 @@ # Synthesizing the waveform is fairly straightforward. Remember that the longer the # spectrogram, the more time-efficient the vocoder. generated_wav = vocoder.infer_waveform(spec) - - + + ## Post-generation # There's a bug with sounddevice that makes the audio cut one second earlier, so we # pad it. @@ -200,9 +183,10 @@ # Trim excess silences to compensate for gaps in spectrograms (issue #53) generated_wav = encoder.preprocess_wav(generated_wav) - + # Play the audio (non-blocking) if not args.no_sound: + import sounddevice as sd try: sd.stop() sd.play(generated_wav, synthesizer.sample_rate) @@ -211,15 +195,15 @@ print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n") except: raise - + # Save it on the disk filename = "demo_output_%02d.wav" % num_generated print(generated_wav.dtype) sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate) num_generated += 1 print("\nSaved output as %s\n\n" % filename) - - + + except Exception as e: print("Caught exception: %s" % repr(e)) print("Restarting\n") diff --git a/demo_toolbox.py b/demo_toolbox.py index ea30a2927..e69eb6261 100644 --- a/demo_toolbox.py +++ b/demo_toolbox.py @@ -1,43 +1,37 @@ +import argparse +import os from pathlib import Path + from toolbox import Toolbox from utils.argutils import print_args -from utils.modelutils import check_model_paths -import argparse -import os +from utils.default_models import ensure_default_models if __name__ == '__main__': parser = argparse.ArgumentParser( - description="Runs the toolbox", + description="Runs the toolbox.", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - + parser.add_argument("-d", "--datasets_root", type=Path, help= \ "Path to the directory containing your datasets. 
See toolbox/__init__.py for a list of " "supported datasets.", default=None) - parser.add_argument("-e", "--enc_models_dir", type=Path, default="encoder/saved_models", - help="Directory containing saved encoder models") - parser.add_argument("-s", "--syn_models_dir", type=Path, default="synthesizer/saved_models", - help="Directory containing saved synthesizer models") - parser.add_argument("-v", "--voc_models_dir", type=Path, default="vocoder/saved_models", - help="Directory containing saved vocoder models") + parser.add_argument("-m", "--models_dir", type=Path, default="saved_models", + help="Directory containing all saved models") parser.add_argument("--cpu", action="store_true", help=\ - "If True, processing is done on CPU, even when a GPU is available.") + "If True, all inference will be done on CPU") parser.add_argument("--seed", type=int, default=None, help=\ "Optional random number seed value to make toolbox deterministic.") - parser.add_argument("--no_mp3_support", action="store_true", help=\ - "If True, no mp3 files are allowed.") args = parser.parse_args() + arg_dict = vars(args) print_args(args, parser) - if args.cpu: - # Hide GPUs from Pytorch to force CPU processing + # Hide GPUs from Pytorch to force CPU processing + if arg_dict.pop("cpu"): os.environ["CUDA_VISIBLE_DEVICES"] = "-1" - del args.cpu - ## Remind the user to download pretrained models if needed - check_model_paths(encoder_path=args.enc_models_dir, synthesizer_path=args.syn_models_dir, - vocoder_path=args.voc_models_dir) + # Remind the user to download pretrained models if needed + ensure_default_models(args.models_dir) # Launch the toolbox - Toolbox(**vars(args)) + Toolbox(**arg_dict) diff --git a/encoder/data_objects/speaker_batch.py b/encoder/data_objects/speaker_batch.py index 56651dba5..e219b738e 100644 --- a/encoder/data_objects/speaker_batch.py +++ b/encoder/data_objects/speaker_batch.py @@ -2,11 +2,12 @@ from typing import List from encoder.data_objects.speaker import Speaker + class SpeakerBatch: def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int): self.speakers = speakers self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers} - + # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40) self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]]) diff --git a/encoder/inference.py b/encoder/inference.py index 4ca417b63..43862e43e 100644 --- a/encoder/inference.py +++ b/encoder/inference.py @@ -4,7 +4,6 @@ from matplotlib import cm from encoder import audio from pathlib import Path -import matplotlib.pyplot as plt import numpy as np import torch @@ -14,12 +13,12 @@ def load_model(weights_fpath: Path, device=None): """ - Loads the model in memory. If this function is not explicitely called, it will be run on the + Loads the model in memory. If this function is not explicitely called, it will be run on the first call to embed_frames() with the default weights file. - + :param weights_fpath: the path to saved model weights. - :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The - model will be loaded and will run on this device. Outputs will however always be on the cpu. + :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The + model will be loaded and will run on this device. Outputs will however always be on the cpu. 
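As a usage note for the `device` parameter documented here: it accepts either a `torch.device` or its name, and the returned embeddings are numpy arrays on the CPU regardless. A sketch, assuming the default pretrained encoder has been downloaded:

```python
from pathlib import Path

import numpy as np

from encoder import inference as encoder

# Force CPU inference; by default the model would go to CUDA when available
encoder.load_model(Path("saved_models/default/encoder.pt"), device="cpu")

embed = encoder.embed_utterance(np.zeros(encoder.sampling_rate, dtype=np.float32))
print(embed.shape)  # (model_embedding_size,), a numpy array on the CPU
```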
If None, will default to your GPU if it"s available, otherwise your CPU. """ # TODO: I think the slow loading of the encoder might have something to do with the device it @@ -34,8 +33,8 @@ def load_model(weights_fpath: Path, device=None): _model.load_state_dict(checkpoint["model_state"]) _model.eval() print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"])) - - + + def is_loaded(): return _model is not None @@ -43,14 +42,14 @@ def is_loaded(): def embed_frames_batch(frames_batch): """ Computes embeddings for a batch of mel spectrogram. - - :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape + + :param frames_batch: a batch mel of spectrogram as a numpy array of float32 of shape (batch_size, n_frames, n_channels) :return: the embeddings as a numpy array of float32 of shape (batch_size, model_embedding_size) """ if _model is None: raise Exception("Model was not loaded. Call load_model() before inference.") - + frames = torch.from_numpy(frames_batch).to(_device) embed = _model.forward(frames).detach().cpu().numpy() return embed @@ -59,32 +58,32 @@ def embed_frames_batch(frames_batch): def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames, min_pad_coverage=0.75, overlap=0.5): """ - Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain - partial utterances of each. Both the waveform and the mel - spectrogram slices are returned, so as to make each partial utterance waveform correspond to - its spectrogram. This function assumes that the mel spectrogram parameters used are those + Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain + partial utterances of each. Both the waveform and the mel + spectrogram slices are returned, so as to make each partial utterance waveform correspond to + its spectrogram. This function assumes that the mel spectrogram parameters used are those defined in params_data.py. - - The returned ranges may be indexing further than the length of the waveform. It is + + The returned ranges may be indexing further than the length of the waveform. It is recommended that you pad the waveform with zeros up to wave_slices[-1].stop. - + :param n_samples: the number of samples in the waveform - :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial + :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial utterance - :param min_pad_coverage: when reaching the last partial utterance, it may or may not have - enough frames. If at least of are present, - then the last partial utterance will be considered, as if we padded the audio. Otherwise, - it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial + :param min_pad_coverage: when reaching the last partial utterance, it may or may not have + enough frames. If at least of are present, + then the last partial utterance will be considered, as if we padded the audio. Otherwise, + it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial utterance, this parameter is ignored so that the function always returns at least 1 slice. - :param overlap: by how much the partial utterance should overlap. If set to 0, the partial - utterances are entirely disjoint. - :return: the waveform slices and mel spectrogram slices as lists of array slices. 
Index - respectively the waveform and the mel spectrogram with these slices to obtain the partial + :param overlap: by how much the partial utterance should overlap. If set to 0, the partial + utterances are entirely disjoint. + :return: the waveform slices and mel spectrogram slices as lists of array slices. Index + respectively the waveform and the mel spectrogram with these slices to obtain the partial utterances. """ assert 0 <= overlap < 1 assert 0 < min_pad_coverage <= 1 - + samples_per_frame = int((sampling_rate * mel_window_step / 1000)) n_frames = int(np.ceil((n_samples + 1) / samples_per_frame)) frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1) @@ -97,34 +96,34 @@ def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_fram wav_range = mel_range * samples_per_frame mel_slices.append(slice(*mel_range)) wav_slices.append(slice(*wav_range)) - + # Evaluate whether extra padding is warranted or not last_wav_range = wav_slices[-1] coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start) if coverage < min_pad_coverage and len(mel_slices) > 1: mel_slices = mel_slices[:-1] wav_slices = wav_slices[:-1] - + return wav_slices, mel_slices def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): """ Computes an embedding for a single utterance. - + # TODO: handle multiple wavs to benefit from batching on GPU :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 - :param using_partials: if True, then the utterance is split in partial utterances of - frames and the utterance embedding is computed from their - normalized average. If False, the utterance is instead computed from feeding the entire + :param using_partials: if True, then the utterance is split in partial utterances of + frames and the utterance embedding is computed from their + normalized average. If False, the utterance is instead computed from feeding the entire spectogram to the network. - :param return_partials: if True, the partial embeddings will also be returned along with the + :param return_partials: if True, the partial embeddings will also be returned along with the wav slices that correspond to the partial embeddings. :param kwargs: additional arguments to compute_partial_splits() - :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If - is True, the partial utterances as a numpy array of float32 of shape - (n_partials, model_embedding_size) and the wav partials as a list of slices will also be - returned. If is simultaneously set to False, both these values will be None + :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If + is True, the partial utterances as a numpy array of float32 of shape + (n_partials, model_embedding_size) and the wav partials as a list of slices will also be + returned. If is simultaneously set to False, both these values will be None instead. 
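The partials workflow this docstring describes is what `embed_utterance` does internally; spelled out step by step it looks like the sketch below, using one second of silence as a stand-in waveform and assuming the default pretrained encoder is present.

```python
from pathlib import Path

import numpy as np

from encoder import audio
from encoder import inference as encoder

encoder.load_model(Path("saved_models/default/encoder.pt"))

# A stand-in waveform: one second at the encoder's sampling rate
wav = np.zeros(encoder.sampling_rate, dtype=np.float32)

wave_slices, mel_slices = encoder.compute_partial_slices(len(wav))

# The last slice may index past the end of the waveform: pad with zeros as recommended
max_wave_length = wave_slices[-1].stop
if max_wave_length >= len(wav):
    wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")

# Embed each partial utterance, then average and re-normalize
frames = audio.wav_to_mel_spectrogram(wav)
frames_batch = np.array([frames[s] for s in mel_slices])
partial_embeds = encoder.embed_frames_batch(frames_batch)
raw_embed = np.mean(partial_embeds, axis=0)
embed = raw_embed / np.linalg.norm(raw_embed, 2)
```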
""" # Process the entire utterance if not using partials @@ -134,22 +133,22 @@ def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs): if return_partials: return embed, None, None return embed - + # Compute where to split the utterance into partials and pad if necessary wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs) max_wave_length = wave_slices[-1].stop if max_wave_length >= len(wav): wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant") - + # Split the utterance into partials frames = audio.wav_to_mel_spectrogram(wav) frames_batch = np.array([frames[s] for s in mel_slices]) partial_embeds = embed_frames_batch(frames_batch) - + # Compute the utterance embedding from the partial embeddings raw_embed = np.mean(partial_embeds, axis=0) embed = raw_embed / np.linalg.norm(raw_embed, 2) - + if return_partials: return embed, partial_embeds, wave_slices return embed @@ -160,19 +159,20 @@ def embed_speaker(wavs, **kwargs): def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)): + import matplotlib.pyplot as plt if ax is None: ax = plt.gca() - + if shape is None: height = int(np.sqrt(len(embed))) shape = (height, -1) embed = embed.reshape(shape) - + cmap = cm.get_cmap() mappable = ax.imshow(embed, cmap=cmap) cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04) sm = cm.ScalarMappable(cmap=cmap) sm.set_clim(*color_range) - + ax.set_xticks([]), ax.set_yticks([]) ax.set_title(title) diff --git a/encoder/preprocess.py b/encoder/preprocess.py index 551a8b29c..d2dfc5ecf 100644 --- a/encoder/preprocess.py +++ b/encoder/preprocess.py @@ -1,12 +1,17 @@ -from multiprocess.pool import ThreadPool -from encoder.params_data import * -from encoder.config import librispeech_datasets, anglophone_nationalites from datetime import datetime -from encoder import audio +from functools import partial +from multiprocessing import Pool from pathlib import Path -from tqdm import tqdm + import numpy as np +from tqdm import tqdm + +from encoder import audio +from encoder.config import librispeech_datasets, anglophone_nationalites +from encoder.params_data import * + +_AUDIO_EXTENSIONS = ("wav", "flac", "m4a", "mp3") class DatasetLog: """ @@ -15,12 +20,12 @@ class DatasetLog: def __init__(self, root, name): self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w") self.sample_data = dict() - + start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) self.write_line("Creating dataset %s on %s" % (name, start_time)) self.write_line("-----") self._log_params() - + def _log_params(self): from encoder import params_data self.write_line("Parameter values:") @@ -28,16 +33,16 @@ def _log_params(self): value = getattr(params_data, param_name) self.write_line("\t%s: %s" % (param_name, value)) self.write_line("-----") - + def write_line(self, line): self.text_file.write("%s\n" % line) - + def add_sample(self, **kwargs): for param_name, value in kwargs.items(): if not param_name in self.sample_data: self.sample_data[param_name] = [] self.sample_data[param_name].append(value) - + def finalize(self): self.write_line("Statistics:") for param_name, values in self.sample_data.items(): @@ -48,8 +53,8 @@ def finalize(self): end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M")) self.write_line("Finished on %s" % end_time) self.text_file.close() - - + + def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog): dataset_root = datasets_root.joinpath(dataset_name) if not 
dataset_root.exists(): @@ -58,62 +63,69 @@ def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, Dat return dataset_root, DatasetLog(out_dir, dataset_name) -def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension, - skip_existing, logger): - print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs))) - - # Function to preprocess utterances for one speaker - def preprocess_speaker(speaker_dir: Path): - # Give a name to the speaker that includes its dataset - speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) - - # Create an output directory with that name, as well as a txt file containing a - # reference to each source file. - speaker_out_dir = out_dir.joinpath(speaker_name) - speaker_out_dir.mkdir(exist_ok=True) - sources_fpath = speaker_out_dir.joinpath("_sources.txt") - - # There's a possibility that the preprocessing was interrupted earlier, check if - # there already is a sources file. - if sources_fpath.exists(): - try: - with sources_fpath.open("r") as sources_file: - existing_fnames = {line.split(",")[0] for line in sources_file} - except: - existing_fnames = {} - else: +def _preprocess_speaker(speaker_dir: Path, datasets_root: Path, out_dir: Path, skip_existing: bool): + # Give a name to the speaker that includes its dataset + speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts) + + # Create an output directory with that name, as well as a txt file containing a + # reference to each source file. + speaker_out_dir = out_dir.joinpath(speaker_name) + speaker_out_dir.mkdir(exist_ok=True) + sources_fpath = speaker_out_dir.joinpath("_sources.txt") + + # There's a possibility that the preprocessing was interrupted earlier, check if + # there already is a sources file. + if sources_fpath.exists(): + try: + with sources_fpath.open("r") as sources_file: + existing_fnames = {line.split(",")[0] for line in sources_file} + except: existing_fnames = {} - - # Gather all audio files for that speaker recursively - sources_file = sources_fpath.open("a" if skip_existing else "w") + else: + existing_fnames = {} + + # Gather all audio files for that speaker recursively + sources_file = sources_fpath.open("a" if skip_existing else "w") + audio_durs = [] + for extension in _AUDIO_EXTENSIONS: for in_fpath in speaker_dir.glob("**/*.%s" % extension): # Check if the target output file already exists out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts) out_fname = out_fname.replace(".%s" % extension, ".npy") if skip_existing and out_fname in existing_fnames: continue - + # Load and preprocess the waveform wav = audio.preprocess_wav(in_fpath) if len(wav) == 0: continue - + # Create the mel spectrogram, discard those that are too short frames = audio.wav_to_mel_spectrogram(wav) if len(frames) < partials_n_frames: continue - + out_fpath = speaker_out_dir.joinpath(out_fname) np.save(out_fpath, frames) - logger.add_sample(duration=len(wav) / sampling_rate) sources_file.write("%s,%s\n" % (out_fname, in_fpath)) - - sources_file.close() - + audio_durs.append(len(wav) / sampling_rate) + + sources_file.close() + + return audio_durs + + +def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger): + print("%s: Preprocessing data for %d speakers." 
% (dataset_name, len(speaker_dirs))) + # Process the utterances for each speaker - with ThreadPool(8) as pool: - list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs), - unit="speakers")) + work_fn = partial(_preprocess_speaker, datasets_root=datasets_root, out_dir=out_dir, skip_existing=skip_existing) + with Pool(4) as pool: + tasks = pool.imap(work_fn, speaker_dirs) + for sample_durs in tqdm(tasks, dataset_name, len(speaker_dirs), unit="speakers"): + for sample_dur in sample_durs: + logger.add_sample(duration=sample_dur) + logger.finalize() print("Done preprocessing %s.\n" % dataset_name) @@ -123,12 +135,11 @@ def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=Fal # Initialize the preprocessing dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) if not dataset_root: - return - + return + # Preprocess all speakers speaker_dirs = list(dataset_root.glob("*")) - _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac", - skip_existing, logger) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger) def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False): @@ -141,24 +152,23 @@ def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False # Get the contents of the meta file with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile: metadata = [line.split("\t") for line in metafile][1:] - + # Select the ID and the nationality, filter out non-anglophone speakers nationalities = {line[0]: line[3] for line in metadata} - keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if + keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if nationality.lower() in anglophone_nationalites] - print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." % + print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." % (len(keep_speaker_ids), len(nationalities))) - + # Get the speaker directories for anglophone speakers only speaker_dirs = dataset_root.joinpath("wav").glob("*") speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if speaker_dir.name in keep_speaker_ids] - print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." % + print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." 
% (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs))) # Preprocess all speakers - _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav", - skip_existing, logger) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger) def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False): @@ -167,9 +177,8 @@ def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir) if not dataset_root: return - + # Get the speaker directories # Preprocess all speakers speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*")) - _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a", - skip_existing, logger) + _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, skip_existing, logger) diff --git a/encoder/train.py b/encoder/train.py index 619952e8d..2bed4eb2f 100644 --- a/encoder/train.py +++ b/encoder/train.py @@ -1,16 +1,19 @@ -from encoder.visualizations import Visualizations +from pathlib import Path + +import torch + from encoder.data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset -from encoder.params_model import * from encoder.model import SpeakerEncoder +from encoder.params_model import * +from encoder.visualizations import Visualizations from utils.profiler import Profiler -from pathlib import Path -import torch + def sync(device: torch.device): # For correct profiling (cuda operations are async) if device.type == "cuda": torch.cuda.synchronize(device) - + def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int, backup_every: int, vis_every: int, force_restart: bool, visdom_server: str, @@ -21,24 +24,25 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, dataset, speakers_per_batch, utterances_per_speaker, - num_workers=8, + num_workers=4, ) - - # Setup the device on which to run the forward pass and the loss. These can be different, + + # Setup the device on which to run the forward pass and the loss. These can be different, # because the forward pass is faster on the GPU whereas the loss is often (depending on your # hyperparameters) faster on the CPU. 
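The comment above (forward pass on the GPU, loss on the CPU) amounts to moving only the network outputs across devices before computing the loss; autograd handles the device boundary. A generic sketch of the pattern, not the repo's actual GE2E step:

```python
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loss_device = torch.device("cpu")

model = torch.nn.Linear(40, 256).to(device)  # stand-in for the speaker encoder
inputs = torch.randn(12, 40, device=device)

embeds = model(inputs)                # forward pass runs on `device`
embeds_loss = embeds.to(loss_device)  # only the outputs are moved

loss = embeds_loss.pow(2).mean()      # loss computed on `loss_device`
loss.backward()                       # gradients flow back to the parameters on `device`
```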
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # FIXME: currently, the gradient is None if loss_device is cuda loss_device = torch.device("cpu") - + # Create the model and the optimizer model = SpeakerEncoder(device, loss_device) optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init) init_step = 1 - + # Configure file path for the model - state_fpath = models_dir.joinpath(run_id + ".pt") - backup_dir = models_dir.joinpath(run_id + "_backups") + model_dir = models_dir / run_id + model_dir.mkdir(exist_ok=True, parents=True) + state_fpath = model_dir / "encoder.pt" # Load any existing model if not force_restart: @@ -54,19 +58,19 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, else: print("Starting the training from scratch.") model.train() - + # Initialize the visualization environment vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom) vis.log_dataset(dataset) vis.log_params() device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU") vis.log_implementation({"Device": device_name}) - + # Training loop profiler = Profiler(summarize_every=10, disabled=False) for step, speaker_batch in enumerate(loader, init_step): profiler.tick("Blocking, waiting for batch (threaded)") - + # Forward pass inputs = torch.from_numpy(speaker_batch.data).to(device) sync(device) @@ -86,16 +90,15 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, model.do_gradient_ops() optimizer.step() profiler.tick("Parameter update") - + # Update visualizations # learning_rate = optimizer.param_groups[0]["lr"] vis.update(loss.item(), eer, step) - + # Draw projections and save them to the backup folder if umap_every != 0 and step % umap_every == 0: print("Drawing and saving projections (step %d)" % step) - backup_dir.mkdir(exist_ok=True) - projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step)) + projection_fpath = model_dir / f"umap_{step:06d}.png" embeds = embeds.detach().cpu().numpy() vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath) vis.save() @@ -108,16 +111,15 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, "model_state": model.state_dict(), "optimizer_state": optimizer.state_dict(), }, state_fpath) - + # Make a backup if backup_every != 0 and step % backup_every == 0: print("Making a backup (step %d)" % step) - backup_dir.mkdir(exist_ok=True) - backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step)) + backup_fpath = model_dir / f"encoder_{step:06d}.bak" torch.save({ "step": step + 1, "model_state": model.state_dict(), "optimizer_state": optimizer.state_dict(), }, backup_fpath) - + profiler.tick("Extras (visualizations, saving)") diff --git a/encoder/visualizations.py b/encoder/visualizations.py index 980c74f95..d103944f2 100644 --- a/encoder/visualizations.py +++ b/encoder/visualizations.py @@ -1,11 +1,12 @@ -from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset from datetime import datetime from time import perf_counter as timer -import matplotlib.pyplot as plt + import numpy as np -# import webbrowser -import visdom import umap +import visdom + +from encoder.data_objects.speaker_verification_dataset import SpeakerVerificationDataset + colormap = np.array([ [76, 255, 0], @@ -21,7 +22,7 @@ [33, 0, 127], [0, 0, 0], [183, 183, 183], -], dtype=np.float) / 255 +], dtype=np.float) / 255 class Visualizations: @@ -33,19 +34,19 @@ def 
__init__(self, env_name=None, update_every=10, server="http://localhost", di self.losses = [] self.eers = [] print("Updating the visualizations every %d steps." % update_every) - + # If visdom is disabled TODO: use a better paradigm for that - self.disabled = disabled + self.disabled = disabled if self.disabled: - return - + return + # Set the environment name now = str(datetime.now().strftime("%d-%m %Hh%M")) if env_name is None: self.env_name = now else: self.env_name = "%s (%s)" % (env_name, now) - + # Connect to visdom and open the corresponding window in the browser try: self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True) @@ -53,7 +54,7 @@ def __init__(self, env_name=None, update_every=10, server="http://localhost", di raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to " "start it.") # webbrowser.open("http://localhost:8097/env/" + self.env_name) - + # Create the windows self.loss_win = None self.eer_win = None @@ -61,10 +62,10 @@ def __init__(self, env_name=None, update_every=10, server="http://localhost", di self.implementation_win = None self.projection_win = None self.implementation_string = "" - + def log_params(self): if self.disabled: - return + return from encoder import params_data from encoder import params_model param_string = "Model parameters:
" @@ -76,26 +77,26 @@ def log_params(self): value = getattr(params_data, param_name) param_string += "\t%s: %s
" % (param_name, value) self.vis.text(param_string, opts={"title": "Parameters"}) - + def log_dataset(self, dataset: SpeakerVerificationDataset): if self.disabled: - return + return dataset_string = "" dataset_string += "Speakers: %s\n" % len(dataset.speakers) dataset_string += "\n" + dataset.get_logs() dataset_string = dataset_string.replace("\n", "
") self.vis.text(dataset_string, opts={"title": "Dataset"}) - + def log_implementation(self, params): if self.disabled: - return + return implementation_string = "" for param, value in params.items(): implementation_string += "%s: %s\n" % (param, value) implementation_string = implementation_string.replace("\n", "
") self.implementation_string = implementation_string self.implementation_win = self.vis.text( - implementation_string, + implementation_string, opts={"title": "Training implementation"} ) @@ -107,7 +108,7 @@ def update(self, loss, eer, step): self.losses.append(loss) self.eers.append(eer) print(".", end="") - + # Update the plots every steps if step % self.update_every != 0: return @@ -142,7 +143,7 @@ def update(self, loss, eer, step): ) if self.implementation_win is not None: self.vis.text( - self.implementation_string + ("%s" % time_string), + self.implementation_string + ("%s" % time_string), win=self.implementation_win, opts={"title": "Training implementation"}, ) @@ -151,16 +152,17 @@ def update(self, loss, eer, step): self.losses.clear() self.eers.clear() self.step_times.clear() - - def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, - max_speakers=10): + + def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, max_speakers=10): + import matplotlib.pyplot as plt + max_speakers = min(max_speakers, len(colormap)) embeds = embeds[:max_speakers * utterances_per_speaker] - + n_speakers = len(embeds) // utterances_per_speaker ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker) colors = [colormap[i] for i in ground_truth] - + reducer = umap.UMAP() projected = reducer.fit_transform(embeds) plt.scatter(projected[:, 0], projected[:, 1], c=colors) @@ -171,8 +173,7 @@ def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None, if out_fpath is not None: plt.savefig(out_fpath) plt.clf() - + def save(self): if not self.disabled: self.vis.save([self.env_name]) - \ No newline at end of file diff --git a/encoder_preprocess.py b/encoder_preprocess.py index 11502013c..dac3e1455 100644 --- a/encoder_preprocess.py +++ b/encoder_preprocess.py @@ -3,10 +3,11 @@ from pathlib import Path import argparse + if __name__ == "__main__": class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): pass - + parser = argparse.ArgumentParser( description="Preprocesses audio files from datasets, encodes them as mel spectrograms and " "writes them to the disk. This will allow you to train the encoder. The " @@ -28,7 +29,7 @@ class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptio parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\ "Path to the output directory that will contain the mel spectrograms. If left out, " "defaults to /SV2TTS/encoder/") - parser.add_argument("-d", "--datasets", type=str, + parser.add_argument("-d", "--datasets", type=str, default="librispeech_other,voxceleb1,voxceleb2", help=\ "Comma-separated list of the name of the datasets you want to preprocess. Only the train " "set of these datasets will be used. Possible names: librispeech_other, voxceleb1, " diff --git a/encoder_train.py b/encoder_train.py index b8740a894..7d70f636b 100644 --- a/encoder_train.py +++ b/encoder_train.py @@ -9,17 +9,18 @@ description="Trains the speaker encoder. You must have run encoder_preprocess.py first.", formatter_class=argparse.ArgumentDefaultsHelpFormatter ) - + parser.add_argument("run_id", type=str, help= \ - "Name for this model instance. If a model state from the same run ID was previously " - "saved, the training will restart from there. Pass -f to overwrite saved states and " - "restart from scratch.") + "Name for this model. By default, training outputs will be stored to saved_models//. 
If a model state " + "from the same run ID was previously saved, the training will restart from there. Pass -f to overwrite saved " + "states and restart from scratch.") parser.add_argument("clean_data_root", type=Path, help= \ "Path to the output directory of encoder_preprocess.py. If you left the default " "output directory when preprocessing, it should be /SV2TTS/encoder/.") - parser.add_argument("-m", "--models_dir", type=Path, default="encoder/saved_models/", help=\ - "Path to the output directory that will contain the saved model weights, as well as " - "backups of those weights and plots generated during training.") + parser.add_argument("-m", "--models_dir", type=Path, default="saved_models", help=\ + "Path to the root directory that contains all models. A directory will be created under this root." + "It will contain the saved model weights, as well as backups of those weights and plots generated during " + "training.") parser.add_argument("-v", "--vis_every", type=int, default=10, help= \ "Number of steps between updates of the loss and the plots.") parser.add_argument("-u", "--umap_every", type=int, default=100, help= \ @@ -37,11 +38,7 @@ parser.add_argument("--no_visdom", action="store_true", help= \ "Disable visdom.") args = parser.parse_args() - - # Process the arguments - args.models_dir.mkdir(exist_ok=True) - + # Run the training print_args(args, parser) train(**vars(args)) - \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index f4148d692..f0c24bfe2 100644 Binary files a/requirements.txt and b/requirements.txt differ diff --git a/synthesizer/inference.py b/synthesizer/inference.py index af7bf083f..340bb1fa1 100644 --- a/synthesizer/inference.py +++ b/synthesizer/inference.py @@ -14,17 +14,17 @@ class Synthesizer: sample_rate = hparams.sample_rate hparams = hparams - + def __init__(self, model_fpath: Path, verbose=True): """ The model isn't instantiated and loaded in memory until needed or until load() is called. - + :param model_fpath: path to the trained model file :param verbose: if False, prints less information when using the model """ self.model_fpath = model_fpath self.verbose = verbose - + # Check for GPU if torch.cuda.is_available(): self.device = torch.device("cuda") @@ -32,7 +32,7 @@ def __init__(self, model_fpath: Path, verbose=True): self.device = torch.device("cpu") if self.verbose: print("Synthesizer using device:", self.device) - + # Tacotron model will be instantiated later on first use. self._model = None @@ -41,7 +41,7 @@ def is_loaded(self): Whether the model is loaded in memory. """ return self._model is not None - + def load(self): """ Instantiates and loads the model given the weights file that was passed in the constructor. @@ -74,23 +74,17 @@ def synthesize_spectrograms(self, texts: List[str], Synthesizes mel spectrograms from texts and speaker embeddings. 
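Since the model only loads on first use (see `__init__` and `load` above), `synthesize_spectrograms` can be called directly after construction. A sketch with dummy speaker embeddings; real ones would come from the encoder:

```python
from pathlib import Path

import numpy as np

from synthesizer.inference import Synthesizer

synthesizer = Synthesizer(Path("saved_models/default/synthesizer.pt"))
print(synthesizer.is_loaded())  # False: weights are only read on first use

texts = ["This is the first test prompt.", "This is the second one."]
embeds = np.random.rand(2, 256).astype(np.float32)  # dummy (N, 256) embeddings
embeds /= np.linalg.norm(embeds, axis=1, keepdims=True)

mels = synthesizer.synthesize_spectrograms(texts, list(embeds))  # triggers load()
print(synthesizer.is_loaded())      # True
print([mel.shape for mel in mels])  # one (80, Mi) spectrogram per prompt
```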
:param texts: a list of N text prompts to be synthesized - :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256) - :param return_alignments: if True, a matrix representing the alignments between the + :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256) + :param return_alignments: if True, a matrix representing the alignments between the characters and each decoder output step will be returned for each spectrogram - :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the + :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the sequence length of spectrogram i, and possibly the alignments. """ # Load the model on the first request. if not self.is_loaded(): self.load() - # Print some info about the model when it is loaded - tts_k = self._model.get_step() // 1000 - - simple_table([("Tacotron", str(tts_k) + "k"), - ("r", self._model.r)]) - # Preprocess text inputs inputs = [text_to_sequence(text.strip(), hparams.tts_cleaner_names) for text in texts] if not isinstance(embeddings, list): @@ -137,7 +131,7 @@ def synthesize_spectrograms(self, texts: List[str], def load_preprocess_wav(fpath): """ Loads and preprocesses an audio file under the same conditions the audio files were used to - train the synthesizer. + train the synthesizer. """ wav = librosa.load(str(fpath), hparams.sample_rate)[0] if hparams.rescale: @@ -147,17 +141,17 @@ def load_preprocess_wav(fpath): @staticmethod def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]): """ - Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that + Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that were fed to the synthesizer when training. 
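A short usage note for these two helpers: `make_spectrogram` accepts either a path (in which case it calls `load_preprocess_wav` itself) or an already-loaded waveform. The file name below is a placeholder.

```python
from pathlib import Path

from synthesizer.inference import Synthesizer

# From a file path: loading, rescaling and mel extraction happen internally
mel = Synthesizer.make_spectrogram("some_utterance.wav")  # placeholder path

# Or preprocess once and reuse the waveform (e.g. to also feed the speaker encoder)
wav = Synthesizer.load_preprocess_wav(Path("some_utterance.wav"))
mel_again = Synthesizer.make_spectrogram(wav)

print(mel.shape)  # (80, n_frames)
```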
""" if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path): wav = Synthesizer.load_preprocess_wav(fpath_or_wav) else: wav = fpath_or_wav - + mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) return mel_spectrogram - + @staticmethod def griffin_lim(mel): """ diff --git a/synthesizer/preprocess.py b/synthesizer/preprocess.py index cde325c41..08c58c40f 100644 --- a/synthesizer/preprocess.py +++ b/synthesizer/preprocess.py @@ -1,4 +1,4 @@ -from multiprocessing.pool import Pool +from multiprocessing.pool import Pool from synthesizer import audio from functools import partial from itertools import chain @@ -10,26 +10,25 @@ import librosa -def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, - skip_existing: bool, hparams, no_alignments: bool, - datasets_name: str, subfolders: str): +def preprocess_dataset(datasets_root: Path, out_dir: Path, n_processes: int, skip_existing: bool, hparams, + no_alignments: bool, datasets_name: str, subfolders: str): # Gather the input directories dataset_root = datasets_root.joinpath(datasets_name) input_dirs = [dataset_root.joinpath(subfolder.strip()) for subfolder in subfolders.split(",")] print("\n ".join(map(str, ["Using data from:"] + input_dirs))) assert all(input_dir.exists() for input_dir in input_dirs) - + # Create the output directories for each output file type out_dir.joinpath("mels").mkdir(exist_ok=True) out_dir.joinpath("audio").mkdir(exist_ok=True) - + # Create a metadata file metadata_fpath = out_dir.joinpath("train.txt") metadata_file = metadata_fpath.open("a" if skip_existing else "w", encoding="utf-8") # Preprocess the dataset speaker_dirs = list(chain.from_iterable(input_dir.glob("*") for input_dir in input_dirs)) - func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing, + func = partial(preprocess_speaker, out_dir=out_dir, skip_existing=skip_existing, hparams=hparams, no_alignments=no_alignments) job = Pool(n_processes).imap(func, speaker_dirs) for speaker_metadata in tqdm(job, datasets_name, len(speaker_dirs), unit="speakers"): @@ -115,13 +114,13 @@ def split_on_silences(wav_fpath, words, end_times, hparams): wav, _ = librosa.load(str(wav_fpath), hparams.sample_rate) if hparams.rescale: wav = wav / np.abs(wav).max() * hparams.rescaling_max - + words = np.array(words) start_times = np.array([0.0] + end_times[:-1]) end_times = np.array(end_times) assert len(words) == len(end_times) == len(start_times) assert words[0] == "" and words[-1] == "" - + # Find pauses that are too long mask = (words == "") & (end_times - start_times >= hparams.silence_min_duration_split) mask[0] = mask[-1] = True @@ -134,7 +133,7 @@ def split_on_silences(wav_fpath, words, end_times, hparams): if len(noisy_wav) > hparams.sample_rate * 0.02: profile = logmmse.profile_noise(noisy_wav, hparams.sample_rate) wav = logmmse.denoise(wav, profile, eta=0) - + # Re-attach segments that are too short segments = list(zip(breaks[:-1], breaks[1:])) segment_durations = [start_times[end] - end_times[start] for start, end in segments] @@ -158,13 +157,13 @@ def split_on_silences(wav_fpath, words, end_times, hparams): del segments[j + 1], segment_durations[j + 1] else: i += 1 - + # Split the utterance segment_times = [[end_times[start], start_times[end]] for start, end in segments] segment_times = (np.array(segment_times) * hparams.sample_rate).astype(np.int) wavs = [wav[segment_time[0]:segment_time[1]] for segment_time in segment_times] texts = [" ".join(words[start + 1:end]).replace(" ", " ") for start, 
end in segments] - + # # DEBUG: play the audio segments (run with -n=1) # import sounddevice as sd # if len(wavs) > 1: @@ -178,25 +177,25 @@ def split_on_silences(wav_fpath, words, end_times, hparams): # print("\t%s" % text) # sd.play(wav, 16000, blocking=True) # print("") - + return wavs, texts - - -def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str, + + +def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str, skip_existing: bool, hparams): ## FOR REFERENCE: # For you not to lose your head if you ever wish to change things here or implement your own # synthesizer. # - Both the audios and the mel spectrograms are saved as numpy arrays - # - There is no processing done to the audios that will be saved to disk beyond volume + # - There is no processing done to the audios that will be saved to disk beyond volume # normalization (in split_on_silences) # - However, pre-emphasis is applied to the audios before computing the mel spectrogram. This # is why we re-apply it on the audio on the side of the vocoder. # - Librosa pads the waveform before computing the mel spectrogram. Here, the waveform is saved # without extra padding. This means that you won't have an exact relation between the length # of the wav and of the mel spectrogram. See the vocoder data loader. - - + + # Skip existing utterances if needed mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename) wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename) @@ -206,27 +205,27 @@ def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str, # Trim silence if hparams.trim_silence: wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True) - + # Skip utterances that are too short if len(wav) < hparams.utterance_min_duration * hparams.sample_rate: return None - + # Compute the mel spectrogram mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32) mel_frames = mel_spectrogram.shape[1] - + # Skip utterances that are too long if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length: return None - + # Write the spectrogram, embed and audio to disk np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False) np.save(wav_fpath, wav, allow_pickle=False) - + # Return a tuple describing this training example return wav_fpath.name, mel_fpath.name, "embed-%s.npy" % basename, len(wav), mel_frames, text - - + + def embed_utterance(fpaths, encoder_model_fpath): if not encoder.is_loaded(): encoder.load_model(encoder_model_fpath) @@ -237,23 +236,23 @@ def embed_utterance(fpaths, encoder_model_fpath): wav = encoder.preprocess_wav(wav) embed = encoder.embed_utterance(wav) np.save(embed_fpath, embed, allow_pickle=False) - - + + def create_embeddings(synthesizer_root: Path, encoder_model_fpath: Path, n_processes: int): wav_dir = synthesizer_root.joinpath("audio") metadata_fpath = synthesizer_root.joinpath("train.txt") assert wav_dir.exists() and metadata_fpath.exists() embed_dir = synthesizer_root.joinpath("embeds") embed_dir.mkdir(exist_ok=True) - + # Gather the input wave filepath and the target output embed filepath with metadata_fpath.open("r") as metadata_file: metadata = [line.split("|") for line in metadata_file] fpaths = [(wav_dir.joinpath(m[0]), embed_dir.joinpath(m[2])) for m in metadata] - + # TODO: improve on the multiprocessing, it's terrible. Disk I/O is the bottleneck here. 
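The worker-pool idiom used here (and in `encoder/preprocess.py` above) is `functools.partial` + `multiprocessing.Pool.imap` + `tqdm`. A self-contained sketch with a toy work function; `partial` is used rather than a lambda because the function must be picklable to reach the worker processes:

```python
from functools import partial
from multiprocessing import Pool

from tqdm import tqdm


def process_item(item, scale):
    # Toy stand-in for embed_utterance / _preprocess_speaker
    return item * scale


if __name__ == "__main__":
    items = list(range(100))
    work_fn = partial(process_item, scale=2)  # bake the fixed arguments in
    with Pool(4) as pool:
        results = list(tqdm(pool.imap(work_fn, items), "Processing", len(items), unit="items"))
    print(sum(results))
```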
# Embed the utterances in separate threads func = partial(embed_utterance, encoder_model_fpath=encoder_model_fpath) job = Pool(n_processes).imap(func, fpaths) list(tqdm(job, "Embedding", len(fpaths), unit="utterances")) - + diff --git a/synthesizer/synthesize.py b/synthesizer/synthesize.py index ffc7dc267..ff7e0023b 100644 --- a/synthesizer/synthesize.py +++ b/synthesizer/synthesize.py @@ -1,19 +1,23 @@ +import platform +from functools import partial +from pathlib import Path + +import numpy as np import torch from torch.utils.data import DataLoader +from tqdm import tqdm + from synthesizer.hparams import hparams_debug_string -from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer from synthesizer.models.tacotron import Tacotron -from synthesizer.utils.text import text_to_sequence +from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer +from synthesizer.utils import data_parallel_workaround from synthesizer.utils.symbols import symbols -import numpy as np -from pathlib import Path -from tqdm import tqdm -import platform -def run_synthesis(in_dir, out_dir, model_dir, hparams): + +def run_synthesis(in_dir: Path, out_dir: Path, syn_model_fpath: Path, hparams): # This generates ground truth-aligned mels for vocoder training - synth_dir = Path(out_dir).joinpath("mels_gta") - synth_dir.mkdir(exist_ok=True) + synth_dir = out_dir / "mels_gta" + synth_dir.mkdir(exist_ok=True, parents=True) print(hparams_debug_string()) # Check for GPU @@ -42,10 +46,8 @@ def run_synthesis(in_dir, out_dir, model_dir, hparams): speaker_embedding_size=hparams.speaker_embedding_size).to(device) # Load the weights - model_dir = Path(model_dir) - model_fpath = model_dir.joinpath(model_dir.stem).with_suffix(".pt") - print("\nLoading weights at %s" % model_fpath) - model.load(model_fpath) + print("\nLoading weights at %s" % syn_model_fpath) + model.load(syn_model_fpath) print("Tacotron weights loaded from step %d" % model.step) # Synthesize using same reduction factor as the model is currently trained @@ -55,26 +57,19 @@ def run_synthesis(in_dir, out_dir, model_dir, hparams): model.eval() # Initialize the dataset - in_dir = Path(in_dir) metadata_fpath = in_dir.joinpath("train.txt") mel_dir = in_dir.joinpath("mels") embed_dir = in_dir.joinpath("embeds") dataset = SynthesizerDataset(metadata_fpath, mel_dir, embed_dir, hparams) - data_loader = DataLoader(dataset, - collate_fn=lambda batch: collate_synthesizer(batch, r, hparams), - batch_size=hparams.synthesis_batch_size, - num_workers=2 if platform.system() != "Windows" else 0, - shuffle=False, - pin_memory=True) + collate_fn = partial(collate_synthesizer, r=r, hparams=hparams) + data_loader = DataLoader(dataset, hparams.synthesis_batch_size, collate_fn=collate_fn, num_workers=2) # Generate GTA mels - meta_out_fpath = Path(out_dir).joinpath("synthesized.txt") - with open(meta_out_fpath, "w") as file: + meta_out_fpath = out_dir / "synthesized.txt" + with meta_out_fpath.open("w") as file: for i, (texts, mels, embeds, idx) in tqdm(enumerate(data_loader), total=len(data_loader)): - texts = texts.to(device) - mels = mels.to(device) - embeds = embeds.to(device) + texts, mels, embeds = texts.to(device), mels.to(device), embeds.to(device) # Parallelize model onto GPUS using workaround due to python bug if device.type == "cuda" and torch.cuda.device_count() > 1: diff --git a/synthesizer/train.py b/synthesizer/train.py index a136cf9b3..d8cc170c4 100644 --- a/synthesizer/train.py +++ b/synthesizer/train.py @@ -1,7 +1,12 @@ +from 
datetime import datetime +from functools import partial +from pathlib import Path + import torch import torch.nn.functional as F from torch import optim from torch.utils.data import DataLoader + from synthesizer import audio from synthesizer.models.tacotron import Tacotron from synthesizer.synthesizer_dataset import SynthesizerDataset, collate_synthesizer @@ -10,24 +15,17 @@ from synthesizer.utils.symbols import symbols from synthesizer.utils.text import sequence_to_text from vocoder.display import * -from datetime import datetime -import numpy as np -from pathlib import Path -import sys -import time -import platform def np_now(x: torch.Tensor): return x.detach().cpu().numpy() + def time_string(): return datetime.now().strftime("%Y-%m-%d %H:%M") -def train(run_id: str, syn_dir: str, models_dir: str, save_every: int, - backup_every: int, force_restart:bool, hparams): - syn_dir = Path(syn_dir) - models_dir = Path(models_dir) +def train(run_id: str, syn_dir: Path, models_dir: Path, save_every: int, backup_every: int, force_restart: bool, + hparams): models_dir.mkdir(exist_ok=True) model_dir = models_dir.joinpath(run_id) @@ -40,20 +38,18 @@ def train(run_id: str, syn_dir: str, models_dir: str, save_every: int, wav_dir.mkdir(exist_ok=True) mel_output_dir.mkdir(exist_ok=True) meta_folder.mkdir(exist_ok=True) - - weights_fpath = model_dir.joinpath(run_id).with_suffix(".pt") + + weights_fpath = model_dir / f"synthesizer.pt" metadata_fpath = syn_dir.joinpath("train.txt") - + print("Checkpoint path: {}".format(weights_fpath)) print("Loading training data from: {}".format(metadata_fpath)) print("Using model: Tacotron") - - # Book keeping - step = 0 + + # Bookkeeping time_window = ValueWindow(100) loss_window = ValueWindow(100) - - + # From WaveRNN/train_tacotron.py if torch.cuda.is_available(): device = torch.device("cuda") @@ -104,16 +100,12 @@ def train(run_id: str, syn_dir: str, models_dir: str, save_every: int, print("\nLoading weights at %s" % weights_fpath) model.load(weights_fpath, optimizer) print("Tacotron weights loaded from step %d" % model.step) - + # Initialize the dataset metadata_fpath = syn_dir.joinpath("train.txt") mel_dir = syn_dir.joinpath("mels") embed_dir = syn_dir.joinpath("embeds") dataset = SynthesizerDataset(metadata_fpath, mel_dir, embed_dir, hparams) - test_loader = DataLoader(dataset, - batch_size=1, - shuffle=True, - pin_memory=True) for i, session in enumerate(hparams.tts_schedule): current_step = model.get_step() @@ -144,14 +136,10 @@ def train(run_id: str, syn_dir: str, models_dir: str, save_every: int, for p in optimizer.param_groups: p["lr"] = lr - data_loader = DataLoader(dataset, - collate_fn=lambda batch: collate_synthesizer(batch, r, hparams), - batch_size=batch_size, - num_workers=2 if platform.system() != "Windows" else 0, - shuffle=True, - pin_memory=True) + collate_fn = partial(collate_synthesizer, r=r, hparams=hparams) + data_loader = DataLoader(dataset, batch_size, shuffle=True, num_workers=2, collate_fn=collate_fn) - total_iters = len(dataset) + total_iters = len(dataset) steps_per_epoch = np.ceil(total_iters / batch_size).astype(np.int32) epochs = np.ceil(training_steps / steps_per_epoch).astype(np.int32) @@ -172,8 +160,7 @@ def train(run_id: str, syn_dir: str, models_dir: str, save_every: int, # Forward pass # Parallelize model onto GPUS using workaround due to python bug if device.type == "cuda" and torch.cuda.device_count() > 1: - m1_hat, m2_hat, attention, stop_pred = data_parallel_workaround(model, texts, - mels, embeds) + m1_hat, m2_hat, attention, 
stop_pred = data_parallel_workaround(model, texts, mels, embeds) else: m1_hat, m2_hat, attention, stop_pred = model(texts, mels, embeds) @@ -200,15 +187,16 @@ def train(run_id: str, syn_dir: str, models_dir: str, save_every: int, step = model.get_step() k = step // 1000 - msg = f"| Epoch: {epoch}/{epochs} ({i}/{steps_per_epoch}) | Loss: {loss_window.average:#.4} | {1./time_window.average:#.2} steps/s | Step: {k}k | " + msg = f"| Epoch: {epoch}/{epochs} ({i}/{steps_per_epoch}) | Loss: {loss_window.average:#.4} | " \ + f"{1./time_window.average:#.2} steps/s | Step: {k}k | " stream(msg) # Backup or save model as appropriate - if backup_every != 0 and step % backup_every == 0 : - backup_fpath = Path("{}/{}_{}k.pt".format(str(weights_fpath.parent), run_id, k)) + if backup_every != 0 and step % backup_every == 0 : + backup_fpath = weights_fpath.parent / f"synthesizer_{k:06d}.pt" model.save(backup_fpath, optimizer) - if save_every != 0 and step % save_every == 0 : + if save_every != 0 and step % save_every == 0 : # Must save latest optimizer state to ensure that resuming training # doesn't produce artifacts model.save(weights_fpath, optimizer) @@ -245,6 +233,7 @@ def train(run_id: str, syn_dir: str, models_dir: str, save_every: int, # Add line break after every epoch print("") + def eval_model(attention, mel_prediction, target_spectrogram, input_seq, step, plot_dir, mel_output_dir, wav_dir, sample_num, loss, hparams): # Save some results for evaluation diff --git a/synthesizer/utils/cleaners.py b/synthesizer/utils/cleaners.py index eab63f05c..a793edaab 100644 --- a/synthesizer/utils/cleaners.py +++ b/synthesizer/utils/cleaners.py @@ -9,80 +9,80 @@ 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update the symbols in symbols.py to match your data). """ - import re from unidecode import unidecode -from .numbers import normalize_numbers +from synthesizer.utils.numbers import normalize_numbers + # Regular expression matching whitespace: _whitespace_re = re.compile(r"\s+") # List of (regular expression, replacement) pairs for abbreviations: _abbreviations = [(re.compile("\\b%s\\." 
% x[0], re.IGNORECASE), x[1]) for x in [ - ("mrs", "misess"), - ("mr", "mister"), - ("dr", "doctor"), - ("st", "saint"), - ("co", "company"), - ("jr", "junior"), - ("maj", "major"), - ("gen", "general"), - ("drs", "doctors"), - ("rev", "reverend"), - ("lt", "lieutenant"), - ("hon", "honorable"), - ("sgt", "sergeant"), - ("capt", "captain"), - ("esq", "esquire"), - ("ltd", "limited"), - ("col", "colonel"), - ("ft", "fort"), + ("mrs", "misess"), + ("mr", "mister"), + ("dr", "doctor"), + ("st", "saint"), + ("co", "company"), + ("jr", "junior"), + ("maj", "major"), + ("gen", "general"), + ("drs", "doctors"), + ("rev", "reverend"), + ("lt", "lieutenant"), + ("hon", "honorable"), + ("sgt", "sergeant"), + ("capt", "captain"), + ("esq", "esquire"), + ("ltd", "limited"), + ("col", "colonel"), + ("ft", "fort"), ]] def expand_abbreviations(text): - for regex, replacement in _abbreviations: - text = re.sub(regex, replacement, text) - return text + for regex, replacement in _abbreviations: + text = re.sub(regex, replacement, text) + return text def expand_numbers(text): - return normalize_numbers(text) + return normalize_numbers(text) def lowercase(text): - """lowercase input tokens.""" - return text.lower() + """lowercase input tokens.""" + return text.lower() def collapse_whitespace(text): - return re.sub(_whitespace_re, " ", text) + return re.sub(_whitespace_re, " ", text) def convert_to_ascii(text): - return unidecode(text) + return unidecode(text) def basic_cleaners(text): - """Basic pipeline that lowercases and collapses whitespace without transliteration.""" - text = lowercase(text) - text = collapse_whitespace(text) - return text + """Basic pipeline that lowercases and collapses whitespace without transliteration.""" + text = lowercase(text) + text = collapse_whitespace(text) + return text def transliteration_cleaners(text): - """Pipeline for non-English text that transliterates to ASCII.""" - text = convert_to_ascii(text) - text = lowercase(text) - text = collapse_whitespace(text) - return text + """Pipeline for non-English text that transliterates to ASCII.""" + text = convert_to_ascii(text) + text = lowercase(text) + text = collapse_whitespace(text) + return text def english_cleaners(text): - """Pipeline for English text, including number and abbreviation expansion.""" - text = convert_to_ascii(text) - text = lowercase(text) - text = expand_numbers(text) - text = expand_abbreviations(text) - text = collapse_whitespace(text) - return text + """Pipeline for English text, including number and abbreviation expansion.""" + text = convert_to_ascii(text) + text = lowercase(text) + text = expand_numbers(text) + text = expand_abbreviations(text) + text = collapse_whitespace(text) + return text diff --git a/synthesizer/utils/numbers.py b/synthesizer/utils/numbers.py index 75020a0bd..1534daa3f 100644 --- a/synthesizer/utils/numbers.py +++ b/synthesizer/utils/numbers.py @@ -1,6 +1,7 @@ import re import inflect + _inflect = inflect.engine() _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])") _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)") @@ -11,58 +12,58 @@ def _remove_commas(m): - return m.group(1).replace(",", "") + return m.group(1).replace(",", "") def _expand_decimal_point(m): - return m.group(1).replace(".", " point ") + return m.group(1).replace(".", " point ") def _expand_dollars(m): - match = m.group(1) - parts = match.split(".") - if len(parts) > 2: - return match + " dollars" # Unexpected format - dollars = int(parts[0]) if parts[0] else 0 - cents = int(parts[1]) if len(parts) > 1 and 
parts[1] else 0 - if dollars and cents: - dollar_unit = "dollar" if dollars == 1 else "dollars" - cent_unit = "cent" if cents == 1 else "cents" - return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit) - elif dollars: - dollar_unit = "dollar" if dollars == 1 else "dollars" - return "%s %s" % (dollars, dollar_unit) - elif cents: - cent_unit = "cent" if cents == 1 else "cents" - return "%s %s" % (cents, cent_unit) - else: - return "zero dollars" + match = m.group(1) + parts = match.split(".") + if len(parts) > 2: + return match + " dollars" # Unexpected format + dollars = int(parts[0]) if parts[0] else 0 + cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0 + if dollars and cents: + dollar_unit = "dollar" if dollars == 1 else "dollars" + cent_unit = "cent" if cents == 1 else "cents" + return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit) + elif dollars: + dollar_unit = "dollar" if dollars == 1 else "dollars" + return "%s %s" % (dollars, dollar_unit) + elif cents: + cent_unit = "cent" if cents == 1 else "cents" + return "%s %s" % (cents, cent_unit) + else: + return "zero dollars" def _expand_ordinal(m): - return _inflect.number_to_words(m.group(0)) + return _inflect.number_to_words(m.group(0)) def _expand_number(m): - num = int(m.group(0)) - if num > 1000 and num < 3000: - if num == 2000: - return "two thousand" - elif num > 2000 and num < 2010: - return "two thousand " + _inflect.number_to_words(num % 100) - elif num % 100 == 0: - return _inflect.number_to_words(num // 100) + " hundred" + num = int(m.group(0)) + if num > 1000 and num < 3000: + if num == 2000: + return "two thousand" + elif num > 2000 and num < 2010: + return "two thousand " + _inflect.number_to_words(num % 100) + elif num % 100 == 0: + return _inflect.number_to_words(num // 100) + " hundred" + else: + return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") else: - return _inflect.number_to_words(num, andword="", zero="oh", group=2).replace(", ", " ") - else: - return _inflect.number_to_words(num, andword="") + return _inflect.number_to_words(num, andword="") def normalize_numbers(text): - text = re.sub(_comma_number_re, _remove_commas, text) - text = re.sub(_pounds_re, r"\1 pounds", text) - text = re.sub(_dollars_re, _expand_dollars, text) - text = re.sub(_decimal_number_re, _expand_decimal_point, text) - text = re.sub(_ordinal_re, _expand_ordinal, text) - text = re.sub(_number_re, _expand_number, text) - return text + text = re.sub(_comma_number_re, _remove_commas, text) + text = re.sub(_pounds_re, r"\1 pounds", text) + text = re.sub(_dollars_re, _expand_dollars, text) + text = re.sub(_decimal_number_re, _expand_decimal_point, text) + text = re.sub(_ordinal_re, _expand_ordinal, text) + text = re.sub(_number_re, _expand_number, text) + return text diff --git a/synthesizer/utils/plot.py b/synthesizer/utils/plot.py index f47d2713d..a470d169d 100644 --- a/synthesizer/utils/plot.py +++ b/synthesizer/utils/plot.py @@ -1,6 +1,3 @@ -import matplotlib -matplotlib.use("Agg") -import matplotlib.pyplot as plt import numpy as np @@ -12,7 +9,12 @@ def split_title_line(title_text, max_words=5): seq = title_text.split() return "\n".join([" ".join(seq[i:i + max_words]) for i in range(0, len(seq), max_words)]) + def plot_alignment(alignment, path, title=None, split_title=False, max_len=None): + import matplotlib + matplotlib.use("Agg") + import matplotlib.pyplot as plt + if max_len is not None: alignment = alignment[:, :max_len] @@ -39,6 +41,10 @@ def plot_alignment(alignment, path, 
title=None, split_title=False, max_len=None) def plot_spectrogram(pred_spectrogram, path, title=None, split_title=False, target_spectrogram=None, max_len=None, auto_aspect=False): + import matplotlib + matplotlib.use("Agg") + import matplotlib.pyplot as plt + if max_len is not None: target_spectrogram = target_spectrogram[:max_len] pred_spectrogram = pred_spectrogram[:max_len] diff --git a/synthesizer/utils/text.py b/synthesizer/utils/text.py index 29372174a..7a56876b6 100644 --- a/synthesizer/utils/text.py +++ b/synthesizer/utils/text.py @@ -1,7 +1,8 @@ -from .symbols import symbols -from . import cleaners +from synthesizer.utils.symbols import symbols +from synthesizer.utils import cleaners import re + # Mappings from symbol to numeric ID and vice versa: _symbol_to_id = {s: i for i, s in enumerate(symbols)} _id_to_symbol = {i: s for i, s in enumerate(symbols)} @@ -11,64 +12,64 @@ def text_to_sequence(text, cleaner_names): - """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. + """Converts a string of text to a sequence of IDs corresponding to the symbols in the text. - The text can optionally have ARPAbet sequences enclosed in curly braces embedded - in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." + The text can optionally have ARPAbet sequences enclosed in curly braces embedded + in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street." - Args: - text: string to convert to a sequence - cleaner_names: names of the cleaner functions to run the text through + Args: + text: string to convert to a sequence + cleaner_names: names of the cleaner functions to run the text through - Returns: - List of integers corresponding to the symbols in the text - """ - sequence = [] + Returns: + List of integers corresponding to the symbols in the text + """ + sequence = [] - # Check for curly braces and treat their contents as ARPAbet: - while len(text): - m = _curly_re.match(text) - if not m: - sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) - break - sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) - sequence += _arpabet_to_sequence(m.group(2)) - text = m.group(3) + # Check for curly braces and treat their contents as ARPAbet: + while len(text): + m = _curly_re.match(text) + if not m: + sequence += _symbols_to_sequence(_clean_text(text, cleaner_names)) + break + sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names)) + sequence += _arpabet_to_sequence(m.group(2)) + text = m.group(3) - # Append EOS token - sequence.append(_symbol_to_id["~"]) - return sequence + # Append EOS token + sequence.append(_symbol_to_id["~"]) + return sequence def sequence_to_text(sequence): - """Converts a sequence of IDs back to a string""" - result = "" - for symbol_id in sequence: - if symbol_id in _id_to_symbol: - s = _id_to_symbol[symbol_id] - # Enclose ARPAbet back in curly braces: - if len(s) > 1 and s[0] == "@": - s = "{%s}" % s[1:] - result += s - return result.replace("}{", " ") + """Converts a sequence of IDs back to a string""" + result = "" + for symbol_id in sequence: + if symbol_id in _id_to_symbol: + s = _id_to_symbol[symbol_id] + # Enclose ARPAbet back in curly braces: + if len(s) > 1 and s[0] == "@": + s = "{%s}" % s[1:] + result += s + return result.replace("}{", " ") def _clean_text(text, cleaner_names): - for name in cleaner_names: - cleaner = getattr(cleaners, name) - if not cleaner: - raise Exception("Unknown cleaner: %s" % name) - text = cleaner(text) - return text + for name in 
cleaner_names: + cleaner = getattr(cleaners, name) + if not cleaner: + raise Exception("Unknown cleaner: %s" % name) + text = cleaner(text) + return text def _symbols_to_sequence(symbols): - return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] + return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)] def _arpabet_to_sequence(text): - return _symbols_to_sequence(["@" + s for s in text.split()]) + return _symbols_to_sequence(["@" + s for s in text.split()]) def _should_keep_symbol(s): - return s in _symbol_to_id and s not in ("_", "~") + return s in _symbol_to_id and s not in ("_", "~") diff --git a/synthesizer_preprocess_audio.py b/synthesizer_preprocess_audio.py index fd4d01d47..4ac9071dd 100644 --- a/synthesizer_preprocess_audio.py +++ b/synthesizer_preprocess_audio.py @@ -17,21 +17,19 @@ parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help=\ "Path to the output directory that will contain the mel spectrograms, the audios and the " "embeds. Defaults to /SV2TTS/synthesizer/") - parser.add_argument("-n", "--n_processes", type=int, default=None, help=\ + parser.add_argument("-n", "--n_processes", type=int, default=4, help=\ "Number of processes in parallel.") parser.add_argument("-s", "--skip_existing", action="store_true", help=\ "Whether to overwrite existing files with the same name. Useful if the preprocessing was " "interrupted.") parser.add_argument("--hparams", type=str, default="", help=\ "Hyperparameter overrides as a comma-separated list of name-value pairs") - parser.add_argument("--no_trim", action="store_true", help=\ - "Preprocess audio without trimming silences (not recommended).") parser.add_argument("--no_alignments", action="store_true", help=\ "Use this option when dataset does not include alignments\ (these are used to split long audio files into sub-utterances.)") parser.add_argument("--datasets_name", type=str, default="LibriSpeech", help=\ "Name of the dataset directory to process.") - parser.add_argument("--subfolders", type=str, default="train-clean-100, train-clean-360", help=\ + parser.add_argument("--subfolders", type=str, default="train-clean-100,train-clean-360", help=\ "Comma-separated list of subfolders to process inside your dataset directory") args = parser.parse_args() @@ -43,16 +41,6 @@ assert args.datasets_root.exists() args.out_dir.mkdir(exist_ok=True, parents=True) - # Verify webrtcvad is available - if not args.no_trim: - try: - import webrtcvad - except: - raise ModuleNotFoundError("Package 'webrtcvad' not found. This package enables " - "noise removal and is recommended. Please install and try again. If installation fails, " - "use --no_trim to disable this error message.") - del args.no_trim - # Preprocess the dataset print_args(args, parser) args.hparams = hparams.parse(args.hparams) diff --git a/synthesizer_preprocess_embeds.py b/synthesizer_preprocess_embeds.py index 94f864d5d..84dd86a61 100644 --- a/synthesizer_preprocess_embeds.py +++ b/synthesizer_preprocess_embeds.py @@ -12,14 +12,14 @@ parser.add_argument("synthesizer_root", type=Path, help=\ "Path to the synthesizer training data that contains the audios and the train.txt file. 
" "If you let everything as default, it should be /SV2TTS/synthesizer/.") - parser.add_argument("-e", "--encoder_model_fpath", type=Path, - default="encoder/saved_models/pretrained.pt", help=\ + parser.add_argument("-e", "--encoder_model_fpath", type=Path, + default="saved_models/default/encoder.pt", help=\ "Path your trained encoder model.") parser.add_argument("-n", "--n_processes", type=int, default=4, help= \ "Number of parallel processes. An encoder is created for each, so you may need to lower " "this value on GPUs with low memory. Set it to 1 if CUDA is unhappy.") args = parser.parse_args() - + # Preprocess the dataset print_args(args, parser) - create_embeddings(**vars(args)) + create_embeddings(**vars(args)) diff --git a/synthesizer_train.py b/synthesizer_train.py index 2743d590d..a01b4fd65 100644 --- a/synthesizer_train.py +++ b/synthesizer_train.py @@ -1,3 +1,5 @@ +from pathlib import Path + from synthesizer.hparams import hparams from synthesizer.train import train from utils.argutils import print_args @@ -7,13 +9,13 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("run_id", type=str, help= \ - "Name for this model instance. If a model state from the same run ID was previously " - "saved, the training will restart from there. Pass -f to overwrite saved states and " - "restart from scratch.") - parser.add_argument("syn_dir", type=str, default=argparse.SUPPRESS, help= \ + "Name for this model. By default, training outputs will be stored to saved_models//. If a model state " + "from the same run ID was previously saved, the training will restart from there. Pass -f to overwrite saved " + "states and restart from scratch.") + parser.add_argument("syn_dir", type=Path, help= \ "Path to the synthesizer directory that contains the ground truth mel spectrograms, " "the wavs and the embeds.") - parser.add_argument("-m", "--models_dir", type=str, default="synthesizer/saved_models/", help=\ + parser.add_argument("-m", "--models_dir", type=Path, default="saved_models", help=\ "Path to the output directory that will contain the saved model weights and the logs.") parser.add_argument("-s", "--save_every", type=int, default=1000, help= \ "Number of steps between updates of the model on the disk. 
Set to 0 to never save the " @@ -23,9 +25,8 @@ "model.") parser.add_argument("-f", "--force_restart", action="store_true", help= \ "Do not load any saved model and restart from scratch.") - parser.add_argument("--hparams", default="", - help="Hyperparameter overrides as a comma-separated list of name=value " - "pairs") + parser.add_argument("--hparams", default="", help=\ + "Hyperparameter overrides as a comma-separated list of name=value pairs") args = parser.parse_args() print_args(args, parser) diff --git a/toolbox/__init__.py b/toolbox/__init__.py index 531d6adef..ad5b8aa70 100644 --- a/toolbox/__init__.py +++ b/toolbox/__init__.py @@ -1,16 +1,17 @@ -from toolbox.ui import UI -from encoder import inference as encoder -from synthesizer.inference import Synthesizer -from vocoder import inference as vocoder +import sys +import traceback from pathlib import Path from time import perf_counter as timer -from toolbox.utterance import Utterance + import numpy as np -import traceback -import sys import torch -import librosa -from audioread.exceptions import NoBackendError + +from encoder import inference as encoder +from synthesizer.inference import Synthesizer +from toolbox.ui import UI +from toolbox.utterance import Utterance +from vocoder import inference as vocoder + # Use this directory structure for your datasets, or modify it to fit your needs recognized_datasets = [ @@ -36,24 +37,17 @@ "VCTK-Corpus/wav48", ] -#Maximum of generated wavs to keep on memory -MAX_WAVES = 15 +# Maximum of generated wavs to keep on memory +MAX_WAVS = 15 + class Toolbox: - def __init__(self, datasets_root, enc_models_dir, syn_models_dir, voc_models_dir, seed, no_mp3_support): - if not no_mp3_support: - try: - librosa.load("samples/6829_00000.mp3") - except NoBackendError: - print("Librosa will be unable to open mp3 files if additional software is not installed.\n" - "Please install ffmpeg or add the '--no_mp3_support' option to proceed without support for mp3 files.") - exit(-1) - self.no_mp3_support = no_mp3_support + def __init__(self, datasets_root: Path, models_dir: Path, seed: int=None): sys.excepthook = self.excepthook self.datasets_root = datasets_root self.utterances = set() self.current_generated = (None, None, None, None) # speaker_name, spec, breaks, wav - + self.synthesizer = None # type: Synthesizer self.current_wav = None self.waves_list = [] @@ -69,14 +63,14 @@ def __init__(self, datasets_root, enc_models_dir, syn_models_dir, voc_models_dir # Initialize the events and the interface self.ui = UI() - self.reset_ui(enc_models_dir, syn_models_dir, voc_models_dir, seed) + self.reset_ui(models_dir, seed) self.setup_events() self.ui.start() def excepthook(self, exc_type, exc_value, exc_tb): traceback.print_exception(exc_type, exc_value, exc_tb) self.ui.log("Exception: %s" % exc_value) - + def setup_events(self): # Dataset, speaker and utterance selection self.ui.browser_load_button.clicked.connect(lambda: self.load_from_browser()) @@ -88,14 +82,14 @@ def setup_events(self): self.ui.random_utterance_button.clicked.connect(random_func(2)) self.ui.dataset_box.currentIndexChanged.connect(random_func(1)) self.ui.speaker_box.currentIndexChanged.connect(random_func(2)) - + # Model selection self.ui.encoder_box.currentIndexChanged.connect(self.init_encoder) - def func(): + def func(): self.synthesizer = None self.ui.synthesizer_box.currentIndexChanged.connect(func) self.ui.vocoder_box.currentIndexChanged.connect(self.init_vocoder) - + # Utterance selection func = lambda: 
self.load_from_browser(self.ui.browse_file()) self.ui.browser_browse_button.clicked.connect(func) @@ -135,11 +129,11 @@ def export_current_wave(self): def replay_last_wav(self): self.ui.play(self.current_wav, Synthesizer.sample_rate) - def reset_ui(self, encoder_models_dir, synthesizer_models_dir, vocoder_models_dir, seed): + def reset_ui(self, models_dir: Path, seed: int=None): self.ui.populate_browser(self.datasets_root, recognized_datasets, 0, True) - self.ui.populate_models(encoder_models_dir, synthesizer_models_dir, vocoder_models_dir) + self.ui.populate_models(models_dir) self.ui.populate_gen_options(seed, self.trim_silences) - + def load_from_browser(self, fpath=None): if fpath is None: fpath = Path(self.datasets_root, @@ -148,37 +142,33 @@ def load_from_browser(self, fpath=None): self.ui.current_utterance_name) name = str(fpath.relative_to(self.datasets_root)) speaker_name = self.ui.current_dataset_name + '_' + self.ui.current_speaker_name - + # Select the next utterance if self.ui.auto_next_checkbox.isChecked(): self.ui.browser_select_next() elif fpath == "": - return + return else: name = fpath.name speaker_name = fpath.parent.name - if fpath.suffix.lower() == ".mp3" and self.no_mp3_support: - self.ui.log("Error: No mp3 file argument was passed but an mp3 file was used") - return - # Get the wav from the disk. We take the wav with the vocoder/synthesizer format for # playback, so as to have a fair comparison with the generated audio wav = Synthesizer.load_preprocess_wav(fpath) self.ui.log("Loaded %s" % name) self.add_real_utterance(wav, name, speaker_name) - + def record(self): wav = self.ui.record_one(encoder.sampling_rate, 5) if wav is None: - return + return self.ui.play(wav, encoder.sampling_rate) speaker_name = "user01" name = speaker_name + "_rec_%05d" % np.random.randint(100000) self.add_real_utterance(wav, name, speaker_name) - + def add_real_utterance(self, wav, name, speaker_name): # Compute the mel spectrogram spec = Synthesizer.make_spectrogram(wav) @@ -198,15 +188,15 @@ def add_real_utterance(self, wav, name, speaker_name): # Plot it self.ui.draw_embed(embed, name, "current") self.ui.draw_umap_projections(self.utterances) - + def clear_utterances(self): self.utterances.clear() self.ui.draw_umap_projections(self.utterances) - + def synthesize(self): self.ui.log("Generating the mel spectrogram...") self.ui.set_loading(1) - + # Update the synthesizer random seed if self.ui.random_seed_checkbox.isChecked(): seed = int(self.ui.seed_textbox.text()) @@ -227,7 +217,7 @@ def synthesize(self): specs = self.synthesizer.synthesize_spectrograms(texts, embeds) breaks = [spec.shape[1] for spec in specs] spec = np.concatenate(specs, axis=1) - + self.ui.draw_spec(spec, "generated") self.current_generated = (self.ui.selected_utterance.speaker_name, spec, breaks, None) self.ui.set_loading(0) @@ -264,7 +254,7 @@ def vocoder_progress(i, seq_len, b_size, gen_rate): wav = Synthesizer.griffin_lim(spec) self.ui.set_loading(0) self.ui.log(" Done!", "append") - + # Add breaks b_ends = np.cumsum(np.array(breaks) * Synthesizer.hparams.hop_size) b_starts = np.concatenate(([0], b_ends[:-1])) @@ -286,7 +276,7 @@ def vocoder_progress(i, seq_len, b_size, gen_rate): #Update waves combobox self.waves_count += 1 - if self.waves_count > MAX_WAVES: + if self.waves_count > MAX_WAVS: self.waves_list.pop() self.waves_namelist.pop() self.waves_list.insert(0, wav) @@ -299,7 +289,7 @@ def vocoder_progress(i, seq_len, b_size, gen_rate): # Update current wav self.set_current_wav(0) - + #Enable replay and save 
buttons: self.ui.replay_wav_button.setDisabled(False) self.ui.export_wav_button.setDisabled(False) @@ -310,19 +300,19 @@ def vocoder_progress(i, seq_len, b_size, gen_rate): self.init_encoder() encoder_wav = encoder.preprocess_wav(wav) embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True) - + # Add the utterance name = speaker_name + "_gen_%05d" % np.random.randint(100000) utterance = Utterance(name, speaker_name, wav, spec, embed, partial_embeds, True) self.utterances.add(utterance) - + # Plot it self.ui.draw_embed(embed, name, "generated") self.ui.draw_umap_projections(self.utterances) - + def init_encoder(self): model_fpath = self.ui.current_encoder_fpath - + self.ui.log("Loading the encoder %s... " % model_fpath) self.ui.set_loading(1) start = timer() @@ -339,13 +329,13 @@ def init_synthesizer(self): self.synthesizer = Synthesizer(model_fpath) self.ui.log("Done (%dms)." % int(1000 * (timer() - start)), "append") self.ui.set_loading(0) - + def init_vocoder(self): model_fpath = self.ui.current_vocoder_fpath # Case of Griffin-lim if model_fpath is None: - return - + return + self.ui.log("Loading the vocoder %s... " % model_fpath) self.ui.set_loading(1) start = timer() @@ -354,4 +344,4 @@ def init_vocoder(self): self.ui.set_loading(0) def update_seed_textbox(self): - self.ui.update_seed_textbox() + self.ui.update_seed_textbox() diff --git a/toolbox/ui.py b/toolbox/ui.py index d56b5740e..e33998ba9 100644 --- a/toolbox/ui.py +++ b/toolbox/ui.py @@ -1,20 +1,21 @@ -import matplotlib.pyplot as plt -from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas -from matplotlib.figure import Figure -from PyQt5.QtCore import Qt, QStringListModel -from PyQt5.QtWidgets import * -from encoder.inference import plot_embedding_as_heatmap -from toolbox.utterance import Utterance +import sys from pathlib import Path +from time import sleep from typing import List, Set +from warnings import filterwarnings, warn + +import matplotlib.pyplot as plt +import numpy as np import sounddevice as sd import soundfile as sf -import numpy as np -# from sklearn.manifold import TSNE # You can try with TSNE if you like, I prefer UMAP -from time import sleep import umap -import sys -from warnings import filterwarnings, warn +from PyQt5.QtCore import Qt, QStringListModel +from PyQt5.QtWidgets import * +from matplotlib.backends.backend_qt5agg import FigureCanvasQTAgg as FigureCanvas + +from encoder.inference import plot_embedding_as_heatmap +from toolbox.utterance import Utterance + filterwarnings("ignore") @@ -33,7 +34,7 @@ [0, 0, 0], [183, 183, 183], [76, 255, 0], -], dtype=np.float) / 255 +], dtype=np.float) / 255 default_text = \ "Welcome to the toolbox! To begin, load an utterance from your datasets or record one " \ @@ -48,26 +49,26 @@ "utterances are of the same color as the speaker whose voice was used, but they're " \ "represented with a cross." 
- + class UI(QDialog): min_umap_points = 4 max_log_lines = 5 max_saved_utterances = 20 - + def draw_utterance(self, utterance: Utterance, which): self.draw_spec(utterance.spec, which) self.draw_embed(utterance.embed, utterance.name, which) - + def draw_embed(self, embed, name, which): embed_ax, _ = self.current_ax if which == "current" else self.gen_ax embed_ax.figure.suptitle("" if embed is None else name) - + ## Embedding # Clear the plot if len(embed_ax.images) > 0: embed_ax.images[0].colorbar.remove() embed_ax.clear() - + # Draw the embed if embed is not None: plot_embedding_as_heatmap(embed, embed_ax) @@ -84,11 +85,9 @@ def draw_spec(self, spec, which): # Draw the spectrogram spec_ax.clear() if spec is not None: - im = spec_ax.imshow(spec, aspect="auto", interpolation="none") - # spec_ax.figure.colorbar(mappable=im, shrink=0.65, orientation="horizontal", - # spec_ax=spec_ax) + spec_ax.imshow(spec, aspect="auto", interpolation="none") spec_ax.set_title("mel spectrogram") - + spec_ax.set_xticks([]) spec_ax.set_yticks([]) spec_ax.figure.canvas.draw() @@ -104,22 +103,21 @@ def draw_umap_projections(self, utterances: Set[Utterance]): # Display a message if there aren't enough points if len(utterances) < self.min_umap_points: - self.umap_ax.text(.5, .5, "Add %d more points to\ngenerate the projections" % - (self.min_umap_points - len(utterances)), + self.umap_ax.text(.5, .5, "Add %d more points to\ngenerate the projections" % + (self.min_umap_points - len(utterances)), horizontalalignment='center', fontsize=15) self.umap_ax.set_title("") - + # Compute the projections else: if not self.umap_hot: self.log( "Drawing UMAP projections for the first time, this will take a few seconds.") self.umap_hot = True - + reducer = umap.UMAP(int(np.ceil(np.sqrt(len(embeds)))), metric="cosine") - # reducer = TSNE() projections = reducer.fit_transform(embeds) - + speakers_done = set() for projection, utterance in zip(projections, utterances): color = colors[utterance.speaker_name] @@ -128,7 +126,6 @@ def draw_umap_projections(self, utterances: Set[Utterance]): speakers_done.add(utterance.speaker_name) self.umap_ax.scatter(projection[0], projection[1], c=[color], marker=mark, label=label) - # self.umap_ax.set_title("UMAP projections") self.umap_ax.legend(prop={'size': 10}) # Draw the plot @@ -137,7 +134,7 @@ def draw_umap_projections(self, utterances: Set[Utterance]): self.umap_ax.set_yticks([]) self.umap_ax.figure.canvas.draw() - def save_audio_file(self, wav, sample_rate): + def save_audio_file(self, wav, sample_rate): dialog = QFileDialog() dialog.setDefaultSuffix(".wav") fpath, _ = dialog.getSaveFileName( @@ -189,14 +186,14 @@ def setup_audio_devices(self, sample_rate): self.set_audio_device() def set_audio_device(self): - + output_device = self.audio_out_devices_cb.currentText() if output_device == "None": output_device = None # If None, sounddevice queries portaudio sd.default.device = (self.audio_in_device, output_device) - + def play(self, wav, sample_rate): try: sd.stop() @@ -205,14 +202,14 @@ def play(self, wav, sample_rate): print(e) self.log("Error in audio playback. 
Try selecting a different audio output device.") self.log("Your device must be connected before you start the toolbox.") - + def stop(self): sd.stop() def record_one(self, sample_rate, duration): self.record_button.setText("Recording...") self.record_button.setDisabled(True) - + self.log("Recording %d seconds of audio" % duration) sd.stop() try: @@ -222,31 +219,31 @@ def record_one(self, sample_rate, duration): self.log("Could not record anything. Is your recording device enabled?") self.log("Your device must be connected before you start the toolbox.") return None - + for i in np.arange(0, duration, 0.1): self.set_loading(i, duration) sleep(0.1) self.set_loading(duration, duration) sd.wait() - + self.log("Done recording.") self.record_button.setText("Record") self.record_button.setDisabled(False) - + return wav.squeeze() - @property + @property def current_dataset_name(self): return self.dataset_box.currentText() @property def current_speaker_name(self): return self.speaker_box.currentText() - + @property def current_utterance_name(self): return self.utterance_box.currentText() - + def browse_file(self): fpath = QFileDialog().getOpenFileName( parent=self, @@ -254,11 +251,11 @@ def browse_file(self): filter="Audio Files (*.mp3 *.flac *.wav *.m4a)" ) return Path(fpath[0]) if fpath[0] != "" else "" - + @staticmethod def repopulate_box(box, items, random=False): """ - Resets a box and adds a list of items. Pass a list of (item, data) pairs instead to join + Resets a box and adds a list of items. Pass a list of (item, data) pairs instead to join data to the items """ box.blockSignals(True) @@ -270,7 +267,7 @@ def repopulate_box(box, items, random=False): box.setCurrentIndex(np.random.randint(len(items)) if random else 0) box.setDisabled(len(items) == 0) box.blockSignals(False) - + def populate_browser(self, datasets_root: Path, recognized_datasets: List, level: int, random=True): # Select a random dataset @@ -282,13 +279,13 @@ def populate_browser(self, datasets_root: Path, recognized_datasets: List, level if datasets_root is None or len(datasets) == 0: msg = "Warning: you d" + ("id not pass a root directory for datasets as argument" \ if datasets_root is None else "o not have any of the recognized datasets" \ - " in %s" % datasets_root) + " in %s" % datasets_root) self.log(msg) msg += ".\nThe recognized datasets are:\n\t%s\nFeel free to add your own. You " \ "can still use the toolbox by recording samples yourself." 
% \ ("\n\t".join(recognized_datasets)) print(msg, file=sys.stderr) - + self.random_utterance_button.setDisabled(True) self.random_speaker_button.setDisabled(True) self.random_dataset_button.setDisabled(True) @@ -297,19 +294,19 @@ def populate_browser(self, datasets_root: Path, recognized_datasets: List, level self.dataset_box.setDisabled(True) self.browser_load_button.setDisabled(True) self.auto_next_checkbox.setDisabled(True) - return + return self.repopulate_box(self.dataset_box, datasets, random) - + # Select a random speaker if level <= 1: speakers_root = datasets_root.joinpath(self.current_dataset_name) speaker_names = [d.stem for d in speakers_root.glob("*") if d.is_dir()] self.repopulate_box(self.speaker_box, speaker_names, random) - + # Select a random utterance if level <= 2: utterances_root = datasets_root.joinpath( - self.current_dataset_name, + self.current_dataset_name, self.current_speaker_name ) utterances = [] @@ -317,7 +314,7 @@ def populate_browser(self, datasets_root: Path, recognized_datasets: List, level utterances.extend(Path(utterances_root).glob("**/*.%s" % extension)) utterances = [fpath.relative_to(utterances_root) for fpath in utterances] self.repopulate_box(self.utterance_box, utterances, random) - + def browser_select_next(self): index = (self.utterance_box.currentIndex() + 1) % len(self.utterance_box) self.utterance_box.setCurrentIndex(index) @@ -325,44 +322,43 @@ def browser_select_next(self): @property def current_encoder_fpath(self): return self.encoder_box.itemData(self.encoder_box.currentIndex()) - + @property def current_synthesizer_fpath(self): return self.synthesizer_box.itemData(self.synthesizer_box.currentIndex()) - + @property def current_vocoder_fpath(self): return self.vocoder_box.itemData(self.vocoder_box.currentIndex()) - def populate_models(self, encoder_models_dir: Path, synthesizer_models_dir: Path, - vocoder_models_dir: Path): + def populate_models(self, models_dir: Path): # Encoder - encoder_fpaths = list(encoder_models_dir.glob("*.pt")) + encoder_fpaths = list(models_dir.glob("*/encoder.pt")) if len(encoder_fpaths) == 0: - raise Exception("No encoder models found in %s" % encoder_models_dir) - self.repopulate_box(self.encoder_box, [(f.stem, f) for f in encoder_fpaths]) - + raise Exception("No encoder models found in %s" % models_dir) + self.repopulate_box(self.encoder_box, [(f.parent.name, f) for f in encoder_fpaths]) + # Synthesizer - synthesizer_fpaths = list(synthesizer_models_dir.glob("**/*.pt")) + synthesizer_fpaths = list(models_dir.glob("*/synthesizer.pt")) if len(synthesizer_fpaths) == 0: - raise Exception("No synthesizer models found in %s" % synthesizer_models_dir) - self.repopulate_box(self.synthesizer_box, [(f.stem, f) for f in synthesizer_fpaths]) + raise Exception("No synthesizer models found in %s" % models_dir) + self.repopulate_box(self.synthesizer_box, [(f.parent.name, f) for f in synthesizer_fpaths]) # Vocoder - vocoder_fpaths = list(vocoder_models_dir.glob("**/*.pt")) - vocoder_items = [(f.stem, f) for f in vocoder_fpaths] + [("Griffin-Lim", None)] + vocoder_fpaths = list(models_dir.glob("*/vocoder.pt")) + vocoder_items = [(f.parent.name, f) for f in vocoder_fpaths] + [("Griffin-Lim", None)] self.repopulate_box(self.vocoder_box, vocoder_items) - + @property def selected_utterance(self): return self.utterance_history.itemData(self.utterance_history.currentIndex()) - + def register_utterance(self, utterance: Utterance): self.utterance_history.blockSignals(True) self.utterance_history.insertItem(0, utterance.name, 
utterance) self.utterance_history.setCurrentIndex(0) self.utterance_history.blockSignals(False) - + if len(self.utterance_history) > self.max_saved_utterances: self.utterance_history.removeItem(self.max_saved_utterances) @@ -380,7 +376,7 @@ def log(self, line, mode="newline"): elif mode == "overwrite": self.logs[-1] = line log_text = '\n'.join(self.logs) - + self.log_window.setText(log_text) self.app.processEvents() @@ -430,25 +426,25 @@ def __init__(self): self.app = QApplication(sys.argv) super().__init__(None) self.setWindowTitle("SV2TTS toolbox") - - + + ## Main layouts # Root root_layout = QGridLayout() self.setLayout(root_layout) - + # Browser browser_layout = QGridLayout() root_layout.addLayout(browser_layout, 0, 0, 1, 2) - + # Generation gen_layout = QVBoxLayout() root_layout.addLayout(gen_layout, 0, 2, 1, 2) - + # Projections self.projections_layout = QVBoxLayout() root_layout.addLayout(self.projections_layout, 1, 0, 1, 1) - + # Visualizations vis_layout = QVBoxLayout() root_layout.addLayout(vis_layout, 1, 1, 1, 3) @@ -479,7 +475,7 @@ def __init__(self): self.browser_load_button = QPushButton("Load") browser_layout.addWidget(self.browser_load_button, i + 1, 3) i += 2 - + # Random buttons self.random_dataset_button = QPushButton("Random") browser_layout.addWidget(self.random_dataset_button, i, 0) @@ -491,13 +487,13 @@ def __init__(self): self.auto_next_checkbox.setChecked(True) browser_layout.addWidget(self.auto_next_checkbox, i, 3) i += 1 - + # Utterance box browser_layout.addWidget(QLabel("Use embedding from:"), i, 0) self.utterance_history = QComboBox() browser_layout.addWidget(self.utterance_history, i, 1, 1, 3) i += 1 - + # Random & next utterance buttons self.browser_browse_button = QPushButton("Browse") browser_layout.addWidget(self.browser_browse_button, i, 0) @@ -520,7 +516,7 @@ def __init__(self): self.vocoder_box = QComboBox() browser_layout.addWidget(QLabel("Vocoder"), i, 2) browser_layout.addWidget(self.vocoder_box, i + 1, 2) - + self.audio_out_devices_cb=QComboBox() browser_layout.addWidget(QLabel("Audio Output"), i, 3) browser_layout.addWidget(self.audio_out_devices_cb, i + 1, 3) @@ -546,12 +542,12 @@ def __init__(self): vis_layout.addStretch() gridspec_kw = {"width_ratios": [1, 4]} - fig, self.current_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0", + fig, self.current_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0", gridspec_kw=gridspec_kw) fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8) vis_layout.addWidget(FigureCanvas(fig)) - fig, self.gen_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0", + fig, self.gen_ax = plt.subplots(1, 2, figsize=(10, 2.25), facecolor="#F0F0F0", gridspec_kw=gridspec_kw) fig.subplots_adjust(left=0, bottom=0.1, right=1, top=0.8) vis_layout.addWidget(FigureCanvas(fig)) @@ -560,15 +556,15 @@ def __init__(self): ax.set_facecolor("#F0F0F0") for side in ["top", "right", "bottom", "left"]: ax.spines[side].set_visible(False) - - + + ## Generation self.text_prompt = QPlainTextEdit(default_text) gen_layout.addWidget(self.text_prompt, stretch=1) - + self.generate_button = QPushButton("Synthesize and vocode") gen_layout.addWidget(self.generate_button) - + layout = QHBoxLayout() self.synthesize_button = QPushButton("Synthesize only") layout.addWidget(self.synthesize_button) @@ -591,18 +587,18 @@ def __init__(self): self.loading_bar = QProgressBar() gen_layout.addWidget(self.loading_bar) - + self.log_window = QLabel() self.log_window.setAlignment(Qt.AlignBottom | Qt.AlignLeft) 
gen_layout.addWidget(self.log_window) self.logs = [] gen_layout.addStretch() - + ## Set the size of the window and of the elements max_size = QDesktopWidget().availableGeometry(self).size() * 0.8 self.resize(max_size) - + ## Finalize the display self.reset_interface() self.show() diff --git a/utils/default_models.py b/utils/default_models.py new file mode 100644 index 000000000..8e2005f9f --- /dev/null +++ b/utils/default_models.py @@ -0,0 +1,49 @@ +import urllib.request +from pathlib import Path +from threading import Thread + +from tqdm import tqdm + + +default_models = { + "encoder": ("https://drive.google.com/uc?export=download&id=1q8mEGwCkFy23KZsinbuvdKAQLqNKbYf1", 17090379), + # Too large to put on google drive with a direct link... + "synthesizer": ("https://download1075.mediafire.com/qo9z9gv56uwg/02w4p210tuudu3u/pretrained.pt", 370554559), + "vocoder": ("https://drive.google.com/uc?export=download&id=1cf2NO6FtI0jDuy8AV3Xgn6leO6dHjIgu", 53845290), +} + + +class DownloadProgressBar(tqdm): + def update_to(self, b=1, bsize=1, tsize=None): + if tsize is not None: + self.total = tsize + self.update(b * bsize - self.n) + + +def download(url: str, target: Path, bar_pos=0): + # Ensure the directory exists + target.parent.mkdir(exist_ok=True, parents=True) + + desc = f"Downloading {target}" + with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc=desc, position=bar_pos, leave=False) as t: + urllib.request.urlretrieve(url, filename=target, reporthook=t.update_to) + + +def ensure_default_models(models_dir: Path): + # Define download tasks + threads = [] + for model_name, (url, size) in default_models.items(): + target_path = models_dir / "default" / f"{model_name}.pt" + if target_path.exists(): + if target_path.stat().st_size != size: + print(f"File {target_path} is not of expected size, redownloading...") + else: + continue + + thread = Thread(target=download, args=(url, target_path, len(threads))) + thread.start() + threads.append(thread) + + # Run and join threads + for thread in threads: + thread.join() diff --git a/utils/modelutils.py b/utils/modelutils.py deleted file mode 100644 index 6acaa984e..000000000 --- a/utils/modelutils.py +++ /dev/null @@ -1,17 +0,0 @@ -from pathlib import Path - -def check_model_paths(encoder_path: Path, synthesizer_path: Path, vocoder_path: Path): - # This function tests the model paths and makes sure at least one is valid. - if encoder_path.is_file() or encoder_path.is_dir(): - return - if synthesizer_path.is_file() or synthesizer_path.is_dir(): - return - if vocoder_path.is_file() or vocoder_path.is_dir(): - return - - # If none of the paths exist, remind the user to download models if needed - print("********************************************************************************") - print("Error: Model files not found. 
Follow these instructions to get and install the models:") - print("https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models") - print("********************************************************************************\n") - quit(-1) diff --git a/vocoder/display.py b/vocoder/display.py index 956880722..3b416095a 100644 --- a/vocoder/display.py +++ b/vocoder/display.py @@ -1,4 +1,3 @@ -import matplotlib.pyplot as plt import time import numpy as np import sys @@ -84,14 +83,18 @@ def time_since(started) : return f'{m}m {s}s' -def save_attention(attn, path) : +def save_attention(attn, path): + import matplotlib.pyplot as plt + fig = plt.figure(figsize=(12, 6)) plt.imshow(attn.T, interpolation='nearest', aspect='auto') fig.savefig(f'{path}.png', bbox_inches='tight') plt.close(fig) -def save_spectrogram(M, path, length=None) : +def save_spectrogram(M, path, length=None): + import matplotlib.pyplot as plt + M = np.flip(M, axis=0) if length : M = M[:, :length] fig = plt.figure(figsize=(12, 6)) @@ -100,7 +103,9 @@ def save_spectrogram(M, path, length=None) : plt.close(fig) -def plot(array) : +def plot(array): + import matplotlib.pyplot as plt + fig = plt.figure(figsize=(30, 5)) ax = fig.add_subplot(111) ax.xaxis.label.set_color('grey') @@ -112,7 +117,9 @@ def plot(array) : plt.plot(array) -def plot_spec(M) : +def plot_spec(M): + import matplotlib.pyplot as plt + M = np.flip(M, axis=0) plt.figure(figsize=(18,4)) plt.imshow(M, interpolation='nearest', aspect='auto') diff --git a/vocoder/train.py b/vocoder/train.py index 6dc2f892e..f3187eb72 100644 --- a/vocoder/train.py +++ b/vocoder/train.py @@ -1,23 +1,25 @@ -from vocoder.models.fatchord_version import WaveRNN -from vocoder.vocoder_dataset import VocoderDataset, collate_vocoder -from vocoder.distribution import discretized_mix_logistic_loss -from vocoder.display import stream, simple_table -from vocoder.gen_wavernn import gen_testset -from torch.utils.data import DataLoader +import time from pathlib import Path -from torch import optim -import torch.nn.functional as F -import vocoder.hparams as hp + import numpy as np -import time import torch -import platform +import torch.nn.functional as F +from torch import optim +from torch.utils.data import DataLoader -def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_truth: bool, - save_every: int, backup_every: int, force_restart: bool): +import vocoder.hparams as hp +from vocoder.display import stream, simple_table +from vocoder.distribution import discretized_mix_logistic_loss +from vocoder.gen_wavernn import gen_testset +from vocoder.models.fatchord_version import WaveRNN +from vocoder.vocoder_dataset import VocoderDataset, collate_vocoder + + +def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_truth: bool, save_every: int, + backup_every: int, force_restart: bool): # Check to make sure the hop length is correctly factorised assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length - + # Instantiate the model print("Initializing the model...") model = WaveRNN( @@ -37,20 +39,17 @@ def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_tr if torch.cuda.is_available(): model = model.cuda() - device = torch.device('cuda') - else: - device = torch.device('cpu') # Initialize the optimizer optimizer = optim.Adam(model.parameters()) - for p in optimizer.param_groups: + for p in optimizer.param_groups: p["lr"] = hp.voc_lr loss_func = F.cross_entropy if model.mode == "RAW" else discretized_mix_logistic_loss # Load 
the weights - model_dir = models_dir.joinpath(run_id) + model_dir = models_dir / run_id model_dir.mkdir(exist_ok=True) - weights_fpath = model_dir.joinpath(run_id + ".pt") + weights_fpath = model_dir / "vocoder.pt" if force_restart or not weights_fpath.exists(): print("\nStarting the training of WaveRNN from scratch\n") model.save(weights_fpath, optimizer) @@ -58,37 +57,29 @@ def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_tr print("\nLoading weights at %s" % weights_fpath) model.load(weights_fpath, optimizer) print("WaveRNN weights loaded from step %d" % model.step) - + # Initialize the dataset metadata_fpath = syn_dir.joinpath("train.txt") if ground_truth else \ voc_dir.joinpath("synthesized.txt") mel_dir = syn_dir.joinpath("mels") if ground_truth else voc_dir.joinpath("mels_gta") wav_dir = syn_dir.joinpath("audio") dataset = VocoderDataset(metadata_fpath, mel_dir, wav_dir) - test_loader = DataLoader(dataset, - batch_size=1, - shuffle=True, - pin_memory=True) + test_loader = DataLoader(dataset, batch_size=1, shuffle=True) # Begin the training simple_table([('Batch size', hp.voc_batch_size), ('LR', hp.voc_lr), ('Sequence Len', hp.voc_seq_len)]) - + for epoch in range(1, 350): - data_loader = DataLoader(dataset, - collate_fn=collate_vocoder, - batch_size=hp.voc_batch_size, - num_workers=2 if platform.system() != "Windows" else 0, - shuffle=True, - pin_memory=True) + data_loader = DataLoader(dataset, hp.voc_batch_size, shuffle=True, num_workers=2, collate_fn=collate_vocoder) start = time.time() running_loss = 0. for i, (x, y, m) in enumerate(data_loader, 1): if torch.cuda.is_available(): x, m, y = x.cuda(), m.cuda(), y.cuda() - + # Forward pass y_hat = model(x, m) if model.mode == 'RAW': @@ -96,7 +87,7 @@ def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_tr elif model.mode == 'MOL': y = y.float() y = y.unsqueeze(-1) - + # Backward pass loss = loss_func(y_hat, y) optimizer.zero_grad() @@ -112,7 +103,7 @@ def train(run_id: str, syn_dir: Path, voc_dir: Path, models_dir: Path, ground_tr if backup_every != 0 and step % backup_every == 0 : model.checkpoint(model_dir, optimizer) - + if save_every != 0 and step % save_every == 0 : model.save(weights_fpath, optimizer) diff --git a/vocoder_preprocess.py b/vocoder_preprocess.py index 7ede3dfb9..32a4d450a 100644 --- a/vocoder_preprocess.py +++ b/vocoder_preprocess.py @@ -1,59 +1,48 @@ -from synthesizer.synthesize import run_synthesis -from synthesizer.hparams import hparams -from utils.argutils import print_args import argparse import os +from pathlib import Path + +from synthesizer.hparams import hparams +from synthesizer.synthesize import run_synthesis +from utils.argutils import print_args + if __name__ == "__main__": class MyFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): pass - + parser = argparse.ArgumentParser( description="Creates ground-truth aligned (GTA) spectrograms from the vocoder.", formatter_class=MyFormatter ) - parser.add_argument("datasets_root", type=str, help=\ + parser.add_argument("datasets_root", type=Path, help=\ "Path to the directory containing your SV2TTS directory. 
If you specify both --in_dir and "
         "--out_dir, this argument won't be used.")
-    parser.add_argument("--model_dir", type=str,
-                        default="synthesizer/saved_models/pretrained/", help=\
-        "Path to the pretrained model directory.")
-    parser.add_argument("-i", "--in_dir", type=str, default=argparse.SUPPRESS, help= \
+    parser.add_argument("-s", "--syn_model_fpath", type=Path,
+                        default="saved_models/default/synthesizer.pt",
+                        help="Path to a saved synthesizer")
+    parser.add_argument("-i", "--in_dir", type=Path, default=argparse.SUPPRESS, help= \
         "Path to the synthesizer directory that contains the mel spectrograms, the wavs and the "
         "embeds. Defaults to <datasets_root>/SV2TTS/synthesizer/.")
-    parser.add_argument("-o", "--out_dir", type=str, default=argparse.SUPPRESS, help= \
+    parser.add_argument("-o", "--out_dir", type=Path, default=argparse.SUPPRESS, help= \
         "Path to the output vocoder directory that will contain the ground truth aligned mel "
         "spectrograms. Defaults to <datasets_root>/SV2TTS/vocoder/.")
-    parser.add_argument("--hparams", default="",
-                        help="Hyperparameter overrides as a comma-separated list of name=value "
-                        "pairs")
-    parser.add_argument("--no_trim", action="store_true", help=\
-        "Preprocess audio without trimming silences (not recommended).")
+    parser.add_argument("--hparams", default="", help=\
+        "Hyperparameter overrides as a comma-separated list of name=value pairs")
     parser.add_argument("--cpu", action="store_true", help=\
         "If True, processing is done on CPU, even when a GPU is available.")
     args = parser.parse_args()
     print_args(args, parser)
     modified_hp = hparams.parse(args.hparams)
-    
+
     if not hasattr(args, "in_dir"):
-        args.in_dir = os.path.join(args.datasets_root, "SV2TTS", "synthesizer")
+        args.in_dir = args.datasets_root / "SV2TTS" / "synthesizer"
     if not hasattr(args, "out_dir"):
-        args.out_dir = os.path.join(args.datasets_root, "SV2TTS", "vocoder")
+        args.out_dir = args.datasets_root / "SV2TTS" / "vocoder"
 
     if args.cpu:
         # Hide GPUs from Pytorch to force CPU processing
         os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
-
-    # Verify webrtcvad is available
-    if not args.no_trim:
-        try:
-            import webrtcvad
-        except:
-            raise ModuleNotFoundError("Package 'webrtcvad' not found. This package enables "
-                "noise removal and is recommended. Please install and try again. If installation fails, "
-                "use --no_trim to disable this error message.")
-        del args.no_trim
-
-    run_synthesis(args.in_dir, args.out_dir, args.model_dir, modified_hp)
+    run_synthesis(args.in_dir, args.out_dir, args.syn_model_fpath, modified_hp)
diff --git a/vocoder_train.py b/vocoder_train.py
index d712ffa3e..2a90cfd48 100644
--- a/vocoder_train.py
+++ b/vocoder_train.py
@@ -1,7 +1,8 @@
+import argparse
+from pathlib import Path
+
 from utils.argutils import print_args
 from vocoder.train import train
-from pathlib import Path
-import argparse
 
 
 if __name__ == "__main__":
@@ -10,21 +11,21 @@
         "or ground truth mels.",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
-    
+
     parser.add_argument("run_id", type=str, help= \
-        "Name for this model instance. If a model state from the same run ID was previously "
-        "saved, the training will restart from there. Pass -f to overwrite saved states and "
-        "restart from scratch.")
-    parser.add_argument("datasets_root", type=str, help= \
+        "Name for this model. By default, training outputs will be stored to saved_models/<run_id>/. If a model state "
+        "from the same run ID was previously saved, the training will restart from there. 
Pass -f to overwrite saved "
+        "states and restart from scratch.")
+    parser.add_argument("datasets_root", type=Path, help= \
         "Path to the directory containing your SV2TTS directory. Specifying --syn_dir or --voc_dir "
         "will take priority over this argument.")
-    parser.add_argument("--syn_dir", type=str, default=argparse.SUPPRESS, help= \
+    parser.add_argument("--syn_dir", type=Path, default=argparse.SUPPRESS, help= \
         "Path to the synthesizer directory that contains the ground truth mel spectrograms, "
         "the wavs and the embeds. Defaults to <datasets_root>/SV2TTS/synthesizer/.")
-    parser.add_argument("--voc_dir", type=str, default=argparse.SUPPRESS, help= \
+    parser.add_argument("--voc_dir", type=Path, default=argparse.SUPPRESS, help= \
         "Path to the vocoder directory that contains the GTA synthesized mel spectrograms. "
         "Defaults to <datasets_root>/SV2TTS/vocoder/. Unused if --ground_truth is passed.")
-    parser.add_argument("-m", "--models_dir", type=str, default="vocoder/saved_models/", help=\
+    parser.add_argument("-m", "--models_dir", type=Path, default="saved_models", help=\
         "Path to the directory that will contain the saved model weights, as well as backups "
         "of those weights and wavs generated during training.")
     parser.add_argument("-g", "--ground_truth", action="store_true", help= \
@@ -41,16 +42,12 @@
 
     # Process the arguments
     if not hasattr(args, "syn_dir"):
-        args.syn_dir = Path(args.datasets_root, "SV2TTS", "synthesizer")
-    args.syn_dir = Path(args.syn_dir)
+        args.syn_dir = args.datasets_root / "SV2TTS" / "synthesizer"
     if not hasattr(args, "voc_dir"):
-        args.voc_dir = Path(args.datasets_root, "SV2TTS", "vocoder")
-    args.voc_dir = Path(args.voc_dir)
+        args.voc_dir = args.datasets_root / "SV2TTS" / "vocoder"
     del args.datasets_root
-    args.models_dir = Path(args.models_dir)
     args.models_dir.mkdir(exist_ok=True)
 
     # Run the training
     print_args(args, parser)
     train(**vars(args))
-    
\ No newline at end of file
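A note on the `vocoder/display.py` hunks above: `matplotlib.pyplot` is no longer imported at module level but inside each plotting helper, so importing the display utilities (e.g. from `vocoder/train.py`) does not require matplotlib or a working plotting backend unless a figure is actually produced. The sketch below illustrates the same deferred-import pattern in isolation; the function name, the demo data, and the output path are illustrative only and not part of the repository.

```python
import numpy as np


def save_heatmap(matrix: np.ndarray, path: str) -> None:
    # Deferred import: matplotlib is only loaded when this function is called,
    # so modules that import this helper do not need matplotlib installed.
    import matplotlib.pyplot as plt

    fig = plt.figure(figsize=(12, 6))
    plt.imshow(matrix, interpolation="nearest", aspect="auto")
    fig.savefig(f"{path}.png", bbox_inches="tight")
    plt.close(fig)


if __name__ == "__main__":
    # Writes example_heatmap.png; matplotlib is only required at this point.
    save_heatmap(np.random.rand(80, 200), "example_heatmap")
```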