Cleanup pr 331 (#366)
[#366] Add CPU support. Also some updates for TensorFlow v2 compatibility (work in progress)
Co-authored-by: pusalieth <pusalieth@users.noreply.github.com>
blue-fish authored Jun 22, 2020
1 parent 5d6d9ff commit 1b8d2e7
Showing 14 changed files with 188 additions and 160 deletions.
41 changes: 22 additions & 19 deletions demo_cli.py
@@ -5,6 +5,7 @@
from vocoder import inference as vocoder
from pathlib import Path
import numpy as np
+import soundfile as sf
import librosa
import argparse
import torch
@@ -30,6 +31,7 @@
"overhead but allows to save some GPU memory for lower-end GPUs.")
parser.add_argument("--no_sound", action="store_true", help=\
"If True, audio won't be played.")
parser.add_argument("--cpu", help="Use CPU.", action="store_true")
args = parser.parse_args()
print_args(args, parser)
if not args.no_sound:
@@ -38,22 +40,25 @@

## Print some environment information (for debugging purposes)
print("Running a test of your configuration...\n")
-if not torch.cuda.is_available():
-    print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
+if args.cpu:
+    print("Using CPU for inference.")
+elif torch.cuda.is_available():
+    device_id = torch.cuda.current_device()
+    gpu_properties = torch.cuda.get_device_properties(device_id)
+    print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
+          "%.1fGb total memory.\n" %
+          (torch.cuda.device_count(),
+           device_id,
+           gpu_properties.name,
+           gpu_properties.major,
+           gpu_properties.minor,
+           gpu_properties.total_memory / 1e9))
+else:
+    print("Your PyTorch installation is not configured. If you have a GPU ready "
           "for deep learning, ensure that the drivers are properly installed, and that your "
-          "CUDA version matches your PyTorch installation. CPU-only inference is currently "
-          "not supported.", file=sys.stderr)
+          "CUDA version matches your PyTorch installation.", file=sys.stderr)
+    print("\nIf you're trying to use a cpu, please use the option --cpu.", file=sys.stderr)
     quit(-1)
-device_id = torch.cuda.current_device()
-gpu_properties = torch.cuda.get_device_properties(device_id)
-print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
-      "%.1fGb total memory.\n" %
-      (torch.cuda.device_count(),
-       device_id,
-       gpu_properties.name,
-       gpu_properties.major,
-       gpu_properties.minor,
-       gpu_properties.total_memory / 1e9))


## Load the models one by one.
@@ -172,15 +177,13 @@
sd.play(generated_wav, synthesizer.sample_rate)

# Save it on the disk
-fpath = "demo_output_%02d.wav" % num_generated
+filename = "demo_output_%02d.wav" % num_generated
 print(generated_wav.dtype)
-librosa.output.write_wav(fpath, generated_wav.astype(np.float32),
-                         synthesizer.sample_rate)
+sf.write(filename, generated_wav.astype(np.float32), synthesizer.sample_rate)
 num_generated += 1
-print("\nSaved output as %s\n\n" % fpath)
+print("\nSaved output as %s\n\n" % filename)


except Exception as e:
print("Caught exception: %s" % repr(e))
print("Restarting\n")

2 changes: 1 addition & 1 deletion encoder/inference.py
@@ -30,7 +30,7 @@ def load_model(weights_fpath: Path, device=None):
elif isinstance(device, str):
_device = torch.device(device)
_model = SpeakerEncoder(_device, torch.device("cpu"))
-checkpoint = torch.load(weights_fpath)
+checkpoint = torch.load(weights_fpath, _device)
_model.load_state_dict(checkpoint["model_state"])
_model.eval()
print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
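Note: the second positional argument of torch.load is map_location, so this change remaps checkpoint storages onto whatever device the encoder was placed on, instead of always deserializing onto the GPU the checkpoint was saved from. A standalone illustration (hypothetical checkpoint path):

    import torch

    # Load a GPU-trained checkpoint on a CPU-only machine by remapping storages.
    checkpoint = torch.load("pretrained.pt", map_location=torch.device("cpu"))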
6 changes: 3 additions & 3 deletions encoder/train.py
@@ -7,11 +7,12 @@
import torch

def sync(device: torch.device):
-    # FIXME
-    return
     # For correct profiling (cuda operations are async)
     if device.type == "cuda":
         torch.cuda.synchronize(device)
+    else:
+        torch.cpu.synchronize(device)


def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
@@ -122,4 +123,3 @@ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int,
}, backup_fpath)

profiler.tick("Extras (visualizations, saving)")

6 changes: 5 additions & 1 deletion requirements.txt
@@ -1,4 +1,8 @@
-tensorflow-gpu>=1.10.0,<=1.14.0
+# each portion of tensorflow is needed
+# core package is for RNN, cpu and gpu are for specific system speed-ups
+tensorflow==1.15
+tensorflow-cpu==1.15
+tensorflow-gpu==1.15
umap-learn
visdom
webrtcvad
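Note: tensorflow, tensorflow-cpu and tensorflow-gpu all provide the same tensorflow import package, so pinning all three generally means the last wheel pip installs wins rather than the three coexisting. A quick sanity check of what actually loaded (standard TF 1.x calls):

    import tensorflow as tf

    print(tf.__version__)              # expect "1.15.x"
    print(tf.test.is_gpu_available())  # False on a CPU-only build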
18 changes: 9 additions & 9 deletions synthesizer/feeder.py
@@ -70,22 +70,22 @@ def __init__(self, coordinator, metadata_filename, hparams):
# Create placeholders for inputs and targets. Don"t specify batch size because we want
# to be able to feed different batch sizes at eval time.
self._placeholders = [
tf.placeholder(tf.int32, shape=(None, None), name="inputs"),
tf.placeholder(tf.int32, shape=(None, ), name="input_lengths"),
tf.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
tf.compat.v1.placeholder(tf.int32, shape=(None, None), name="inputs"),
tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="input_lengths"),
tf.compat.v1.placeholder(tf.float32, shape=(None, None, hparams.num_mels),
name="mel_targets"),
-tf.placeholder(tf.float32, shape=(None, None), name="token_targets"),
-tf.placeholder(tf.int32, shape=(None, ), name="targets_lengths"),
-tf.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
+tf.compat.v1.placeholder(tf.float32, shape=(None, None), name="token_targets"),
+tf.compat.v1.placeholder(tf.int32, shape=(None, ), name="targets_lengths"),
+tf.compat.v1.placeholder(tf.int32, shape=(hparams.tacotron_num_gpus, None),
name="split_infos"),

# SV2TTS
-tf.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
+tf.compat.v1.placeholder(tf.float32, shape=(None, hparams.speaker_embedding_size),
name="speaker_embeddings")
]

# Create queue for buffering data
-queue = tf.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32,
+queue = tf.queue.FIFOQueue(8, [tf.int32, tf.int32, tf.float32, tf.float32,
tf.int32, tf.int32, tf.float32], name="input_queue")
self._enqueue_op = queue.enqueue(self._placeholders)
self.inputs, self.input_lengths, self.mel_targets, self.token_targets, \
@@ -100,7 +100,7 @@ def __init__(self, coordinator, metadata_filename, hparams):
self.speaker_embeddings.set_shape(self._placeholders[6].shape)

# Create eval queue for buffering eval data
-eval_queue = tf.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32,
+eval_queue = tf.queue.FIFOQueue(1, [tf.int32, tf.int32, tf.float32, tf.float32,
tf.int32, tf.int32, tf.float32], name="eval_queue")
self._eval_enqueue_op = eval_queue.enqueue(self._placeholders)
self.eval_inputs, self.eval_input_lengths, self.eval_mel_targets, \
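Note: under the pinned TF 1.15, tf.compat.v1 is an alias of the top-level API, so these renames are forward-compatibility work rather than behavior changes. If this graph-mode code were actually run on TF 2.x, eager execution would also have to be disabled first; a self-contained sketch of the pattern (toy graph, not repo code):

    import tensorflow as tf

    tf.compat.v1.disable_eager_execution()  # required for placeholders on TF 2.x

    x = tf.compat.v1.placeholder(tf.int32, shape=(None,), name="inputs")
    y = x * 2
    with tf.compat.v1.Session() as sess:
        print(sess.run(y, feed_dict={x: [1, 2, 3]}))  # [2 4 6]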
5 changes: 2 additions & 3 deletions synthesizer/inference.py
@@ -54,7 +54,7 @@ def load(self):
"""
if self._low_mem:
raise Exception("Cannot load the synthesizer permanently in low mem mode")
-tf.reset_default_graph()
+tf.compat.v1.reset_default_graph()
self._model = Tacotron2(self.checkpoint_fpath, hparams)

def synthesize_spectrograms(self, texts: List[str],
@@ -88,7 +88,7 @@ def synthesize_spectrograms(self, texts: List[str],
@staticmethod
def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts):
# Load the model and forward the inputs
-tf.reset_default_graph()
+tf.compat.v1.reset_default_graph()
model = Tacotron2(checkpoint_fpath, hparams)
specs, alignments = model.my_synthesize(embeddings, texts)

@@ -134,4 +134,3 @@ def griffin_lim(mel):
with the same parameters present in hparams.py.
"""
return audio.inv_mel_spectrogram(mel, hparams)

8 changes: 4 additions & 4 deletions synthesizer/models/attention.py
@@ -60,10 +60,10 @@ def _location_sensitive_score(W_query, W_fil, W_keys):
dtype = W_query.dtype
num_units = W_keys.shape[-1].value or array_ops.shape(W_keys)[-1]

-v_a = tf.get_variable(
+v_a = tf.compat.v1.get_variable(
"attention_variable_projection", shape=[num_units], dtype=dtype,
initializer=tf.contrib.layers.xavier_initializer())
-b_a = tf.get_variable(
+b_a = tf.compat.v1.get_variable(
"attention_bias", shape=[num_units], dtype=dtype,
initializer=tf.zeros_initializer())

@@ -155,10 +155,10 @@ def __init__(self,
probability_fn=normalization_function,
name=name)

-self.location_convolution = tf.layers.Conv1D(filters=hparams.attention_filters,
+self.location_convolution = tf.compat.v1.layers.Conv1D(filters=hparams.attention_filters,
kernel_size=hparams.attention_kernel, padding="same", use_bias=True,
bias_initializer=tf.zeros_initializer(), name="location_features_convolution")
-self.location_layer = tf.layers.Dense(units=num_units, use_bias=False,
+self.location_layer = tf.compat.v1.layers.Dense(units=num_units, use_bias=False,
dtype=tf.float32, name="location_features_layer")
self._cumulate = cumulate_weights

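Note: these hunks migrate get_variable and the layer classes to compat.v1, but the xavier_initializer between them still lives in tf.contrib, which TF 2.x drops entirely. A contrib-free equivalent for the same Glorot-uniform init (the shape here is illustrative):

    import tensorflow as tf

    # tf.compat.v1.glorot_uniform_initializer() matches
    # tf.contrib.layers.xavier_initializer() without depending on tf.contrib.
    v_a = tf.compat.v1.get_variable(
        "attention_variable_projection", shape=[128], dtype=tf.float32,
        initializer=tf.compat.v1.glorot_uniform_initializer())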
2 changes: 1 addition & 1 deletion synthesizer/models/helpers.py
@@ -119,7 +119,7 @@ def next_inputs(self, time, outputs, state, sample_ids, stop_token_prediction, n

#Pick previous outputs randomly with respect to teacher forcing ratio
next_inputs = tf.cond(
-tf.less(tf.random_uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio),
+tf.less(tf.random.uniform([], minval=0, maxval=1, dtype=tf.float32), self._ratio),
lambda: self._targets[:, time, :], #Teacher-forcing: return true frame
lambda: outputs[:,-self._output_dim:])

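Note: the tf.cond above draws one uniform sample per decoder step and compares it to the teacher-forcing ratio, picking either the ground-truth frame or the model's own last output. The same schedule in plain Python (illustrative helper, not repo code):

    import random

    def next_decoder_input(target_frame, prev_output, teacher_forcing_ratio):
        # With probability `teacher_forcing_ratio`, feed the ground truth;
        # otherwise feed back the model's previous prediction.
        if random.random() < teacher_forcing_ratio:
            return target_frame
        return prev_output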
[Diffs for the remaining 6 changed files did not load on this page.]