Merge pull request deezer#444 from deezer/pad_waveform

Added padding at the beginning to avoid tf STFT reconstruction error
Quemandoacromo · Jul 24, 2020 · ca5cdd7 · ca5cdd7
2 parents 4744ffb + 3fcc4ea
commit ca5cdd7
Show file tree

Hide file tree

Showing 4 changed files with 80 additions and 35 deletions.
diff --git a/spleeter/commands/__init__.py b/spleeter/commands/__init__.py
@@ -170,6 +170,7 @@ def _create_evaluate_parser(parser_factory):
     parser.add_argument('-o', '--output_path', **OPT_OUTPUT)
     parser.add_argument('--mus_dir', **OPT_MUSDB)
     parser.add_argument('-m', '--mwf', **OPT_MWF)
+    parser.add_argument('-B', '--stft-backend', **OPT_STFT_BACKEND)
     return parser
 
 

diff --git a/spleeter/commands/evaluate.py b/spleeter/commands/evaluate.py
@@ -77,7 +77,7 @@ def _separate_evaluation_dataset(arguments, musdb_root_directory, params):
             bitrate='128k',
             MWF=arguments.MWF,
             verbose=arguments.verbose,
-            stft_backend="auto"),
+            stft_backend=arguments.stft_backend),
         params)
     return audio_output_directory
 

diff --git a/spleeter/model/__init__.py b/spleeter/model/__init__.py
@@ -275,9 +275,16 @@ def _build_stft_feature(self):
         spec_name = self.spectrogram_name
 
         if stft_name not in self._features:
+            # pad input with a frame of zeros
+            waveform =  tf.concat([
+                            tf.zeros((self._frame_length, self._n_channels)),
+                            self._features['waveform']
+                            ],
+                            0
+                        )
             stft_feature = tf.transpose(
                 stft(
-                    tf.transpose(self._features['waveform']),
+                    tf.transpose(waveform),
                     self._frame_length,
                     self._frame_step,
                     window_fn=lambda frame_length, dtype: (
@@ -341,7 +348,7 @@ def _inverse_stft(self, stft_t, time_crop=None):
         reshaped = tf.transpose(inversed)
         if time_crop is None:
             time_crop = tf.shape(self._features['waveform'])[0]
-        return reshaped[:time_crop, :]
+        return reshaped[self._frame_length:self._frame_length+time_crop, :]
 
     def _build_mwf_output_waveform(self):
         """ Perform separation with multichannel Wiener Filtering using Norbert.

diff --git a/tests/test_eval.py b/tests/test_eval.py
@@ -25,33 +25,64 @@
 
 from spleeter.utils.configuration import load_configuration
 
-res_4stems = {  "vocals": {
-                    "SDR": -0.007,
-                    "SAR": -19.231,
-                    "SIR": -4.528,
-                    "ISR": 0.000
-                },
-                "drums": {
-                    "SDR": -0.071,
-                    "SAR": -14.496,
-                    "SIR": -4.987,
-                    "ISR": 0.001
-                },
-                "bass":{
-                    "SDR": -0.001,
-                    "SAR": -12.426,
-                    "SIR": -7.198,
-                    "ISR": -0.001
+BACKENDS = ["tensorflow", "librosa"]
+TEST_CONFIGURATIONS = {el:el for el in BACKENDS}
+
+res_4stems = {
+                "librosa": {
+                    "vocals": {
+                        "SDR": -0.007,
+                        "SAR": -19.231,
+                        "SIR": -4.528,
+                        "ISR": 0.000
+                    },
+                    "drums": {
+                        "SDR": -0.071,
+                        "SAR": -14.496,
+                        "SIR": -4.987,
+                        "ISR": 0.001
+                    },
+                    "bass":{
+                        "SDR": -0.001,
+                        "SAR": -12.426,
+                        "SIR": -7.198,
+                        "ISR": -0.001
+                    },
+                    "other":{
+                        "SDR": -1.453,
+                        "SAR": -14.899,
+                        "SIR": -4.678,
+                        "ISR": -0.015
+                    }
                 },
-                "other":{
-                    "SDR": -1.453,
-                    "SAR": -14.899,
-                    "SIR": -4.678,
-                    "ISR": -0.015
+                "tensorflow": {
+                    "vocals": {
+                        "SDR": 3.25e-05,
+                        "SAR": -11.153575,
+                        "SIR": -1.3849,
+                        "ISR": 2.75e-05
+                    },
+                    "drums": {
+                        "SDR": -0.079505,
+                        "SAR": -15.7073575,
+                        "SIR": -4.972755,
+                        "ISR": 0.0013575
+                    },
+                    "bass":{
+                        "SDR": 2.5e-06,
+                        "SAR": -10.3520575,
+                        "SIR": -4.272325,
+                        "ISR": 2.5e-06
+                    },
+                    "other":{
+                        "SDR": -1.359175,
+                        "SAR": -14.7076775,
+                        "SIR": -4.761505,
+                        "ISR": -0.01528
+                    }
                 }
             }
 
-
 def generate_fake_eval_dataset(path):
     aa = get_default_audio_adapter()
     n_songs = 2
@@ -68,12 +99,18 @@ def generate_fake_eval_dataset(path):
             aa.save(filename, data, fs)
 
 
-def test_evaluate(path="FAKE_MUSDB_DIR"):
-    generate_fake_eval_dataset(path)
-    p = create_argument_parser()
-    arguments = p.parse_args(["evaluate", "-p", "spleeter:4stems", "--mus_dir", path])
-    params = load_configuration(arguments.configuration)
-    metrics = evaluate.entrypoint(arguments, params)
-    for instrument, metric in metrics.items():
-        for metric, value in metric.items():
-            assert np.allclose(np.median(value), res_4stems[instrument][metric], atol=1e-3)
+@pytest.mark.parametrize('backend', TEST_CONFIGURATIONS)
+def test_evaluate(backend):
+    with TemporaryDirectory() as directory:
+
+        generate_fake_eval_dataset(directory)
+        p = create_argument_parser()
+        arguments = p.parse_args(["evaluate", "-p", "spleeter:4stems", "--mus_dir", directory, "-B", backend])
+        params = load_configuration(arguments.configuration)
+        metrics = evaluate.entrypoint(arguments, params)
+        for instrument, metric in metrics.items():
+            for metric, value in metric.items():
+                assert np.allclose(np.median(value), res_4stems[backend][instrument][metric], atol=1e-3)
+
+
+# test_evaluate("tensorflow")