Skip to content

Commit 6ae5931

Browse files
authored
Merge pull request #13 from Spijkervet/pitch_tests
End-to-end PitchShift transform tests
2 parents dad94b1 + cf2cf41 commit 6ae5931

File tree

3 files changed

+40
-7
lines changed

3 files changed

+40
-7
lines changed

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222

2323
# What packages are required for this module to be executed?
2424
REQUIRED = ["numpy", "torch", "torchaudio", "julius", "wavaugment", "torch-pitch-shift"]
25-
TEST_REQUIRED = ["pytest", "black"]
25+
TEST_REQUIRED = ["pytest", "black", "librosa"]
2626

2727
# What packages are optional?
2828
EXTRAS = {

tests/test_augmentations.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
import numpy as np
1+
import librosa
22
import torch
3-
import torchaudio
43
import pytest
4+
import numpy as np
55

66
from torchaudio_augmentations import (
77
Compose,
@@ -162,14 +162,45 @@ def test_pitch_shift_fast_ratios():
162162

163163
def test_pitch_shift_no_fast_ratios():
164164
with pytest.raises(ValueError):
165-
ps = PitchShift(
165+
_ = PitchShift(
166166
n_samples=num_samples,
167167
sample_rate=sample_rate,
168168
pitch_shift_min=4,
169169
pitch_shift_max=4,
170170
)
171171

172172

173+
def test_pitch_shift_transform_with_pitch_detection():
174+
"""To check semi-tone values, check: http://www.homepages.ucl.ac.uk/~sslyjjt/speech/semitone.html"""
175+
176+
source_frequency = 440
177+
max_semitone_shift = 4
178+
expected_frequency_shift = 554
179+
180+
num_channels = 1
181+
audio = generate_waveform(
182+
sample_rate, num_samples, num_channels, frequency=source_frequency
183+
)
184+
pitch_shift = PitchShift(
185+
n_samples=num_samples,
186+
sample_rate=sample_rate,
187+
pitch_shift_min=max_semitone_shift,
188+
pitch_shift_max=max_semitone_shift + 1,
189+
)
190+
191+
t_audio = pitch_shift(audio)
192+
librosa_audio = t_audio[0].numpy()
193+
f0_hz, _, _ = librosa.pyin(librosa_audio, fmin=10, fmax=1000)
194+
195+
# remove nan values:
196+
f0_hz = f0_hz[~np.isnan(f0_hz)]
197+
198+
detected_f0_hz = np.max(f0_hz)
199+
200+
# the difference between the detected and expected frequency should be smaller than 20 Hz.
201+
assert abs(detected_f0_hz - expected_frequency_shift) < 20
202+
203+
173204
@pytest.mark.parametrize("num_channels", [1, 2])
174205
def test_reverb(num_channels):
175206
audio = generate_waveform(sample_rate, num_samples, num_channels)

tests/utils.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@
33

44

55
def generate_waveform(
6-
sample_rate: int, num_samples: int, num_channels: int
6+
sample_rate: int,
7+
num_samples: int,
8+
num_channels: int,
9+
frequency: int = 440,
710
) -> torch.Tensor:
811

912
# Dividing x length value into three parts: 1/10, 1/2, 4/10.
@@ -18,8 +21,7 @@ def generate_waveform(
1821
sustain = np.ones(sustain_length) * sustain_value
1922
attack_decay_sustain = np.concatenate((attack, decay, sustain))
2023

21-
freq = 440
22-
wavedata = np.sin(2 * np.pi * np.arange(num_samples) * freq / sample_rate)
24+
wavedata = np.sin(2 * np.pi * np.arange(num_samples) * frequency / sample_rate)
2325

2426
wavedata = wavedata * attack_decay_sustain
2527

0 commit comments

Comments
 (0)