Commit a355b68

Updated pitch shift backend to torch-pitch-shift
1 parent f3e0ad8 commit a355b68

File tree: 6 files changed, +147 -47 lines changed

setup.py (+5 -3)

@@ -18,14 +18,16 @@
 EMAIL = "janne.spijkervet@gmail.com"
 AUTHOR = "Janne Spijkervet"
 REQUIRES_PYTHON = ">=3.6.0"
-VERSION = "0.2.2"
+VERSION = "0.2.3"

 # What packages are required for this module to be executed?
-REQUIRED = ["numpy", "torch", "torchaudio", "julius", "wavaugment"]
+REQUIRED = ["numpy", "torch", "torchaudio", "julius", "wavaugment", "torch-pitch-shift"]
+TEST_REQUIRED = ["pytest"]

 # What packages are optional?
 EXTRAS = {
-    'fancy feature': [''],
+    "fancy feature": [""],
+    "test": TEST_REQUIRED,
 }

 # The rest you shouldn't have to touch too much :)
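With the new test extra, pytest can be installed together with the package, e.g. pip install -e ".[test]" from a checkout. This assumes REQUIRED and EXTRAS are wired into setup() further down in setup.py as the standard setuptools install_requires and extras_require arguments; that call sits outside this hunk.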

tests/test_augmentations.py (+71 -11)

@@ -33,11 +33,37 @@ def test_random_resized_crop(num_channels):
     assert audio.shape[1] == num_samples


+@pytest.mark.parametrize(
+    ["batch_size", "num_channels"],
+    [
+        (1, 1),
+        (4, 1),
+        (16, 1),
+        (1, 2),
+        (4, 2),
+        (16, 2),
+    ],
+)
+def test_random_resized_crop_batched(batch_size, num_channels):
+
+    num_samples = 22050 * 5
+    audio = generate_waveform(sample_rate, num_samples, num_channels)
+    audio = audio.repeat(batch_size, 1, 1)
+
+    transform = Compose([RandomResizedCrop(num_samples)])
+
+    audio = transform(audio)
+    assert audio.shape[0] == batch_size
+    assert audio.shape[1] == num_channels
+    assert audio.shape[2] == num_samples
+
+
 @pytest.mark.parametrize("num_channels", [1, 2])
 def test_polarity(num_channels):
-    audio = generate_waveform(sample_rate, num_samples,
-                              num_channels=num_channels)
-    transform = Compose([PolarityInversion()],)
+    audio = generate_waveform(sample_rate, num_samples, num_channels=num_channels)
+    transform = Compose(
+        [PolarityInversion()],
+    )

     t_audio = transform(audio)
     assert (t_audio == torch.neg(audio)).all()

@@ -47,7 +73,9 @@ def test_polarity(num_channels):
 @pytest.mark.parametrize("num_channels", [1, 2])
 def test_filter(num_channels):
     audio = generate_waveform(sample_rate, num_samples, num_channels)
-    transform = Compose([HighLowPass(sample_rate=sample_rate)],)
+    transform = Compose(
+        [HighLowPass(sample_rate=sample_rate)],
+    )
     t_audio = transform(audio)
     # torchaudio.save("tests/filter.wav", t_audio, sample_rate=sample_rate)
     assert t_audio.shape == audio.shape

@@ -56,7 +84,9 @@ def test_filter(num_channels):
 @pytest.mark.parametrize("num_channels", [1, 2])
 def test_delay(num_channels):
     audio = generate_waveform(sample_rate, num_samples, num_channels)
-    transform = Compose([Delay(sample_rate=sample_rate)],)
+    transform = Compose(
+        [Delay(sample_rate=sample_rate)],
+    )

     t_audio = transform(audio)
     # torchaudio.save("tests/delay.wav", t_audio, sample_rate=sample_rate)

@@ -66,7 +96,9 @@ def test_delay(num_channels):
 @pytest.mark.parametrize("num_channels", [1, 2])
 def test_gain(num_channels):
     audio = generate_waveform(sample_rate, num_samples, num_channels)
-    transform = Compose([Gain()],)
+    transform = Compose(
+        [Gain()],
+    )

     t_audio = transform(audio)
     # torchaudio.save("tests/gain.wav", t_audio, sample_rate=sample_rate)

@@ -76,7 +108,9 @@ def test_gain(num_channels):
 @pytest.mark.parametrize("num_channels", [1, 2])
 def test_noise(num_channels):
     audio = generate_waveform(sample_rate, num_samples, num_channels)
-    transform = Compose([Noise(min_snr=0.5, max_snr=1)],)
+    transform = Compose(
+        [Noise(min_snr=0.5, max_snr=1)],
+    )

     t_audio = transform(audio)
     # torchaudio.save("tests/noise.wav", t_audio, sample_rate=sample_rate)

@@ -87,17 +121,41 @@ def test_noise(num_channels):
 def test_pitch(num_channels):
     audio = generate_waveform(sample_rate, num_samples, num_channels)
     transform = Compose(
-        [PitchShift(n_samples=num_samples, sample_rate=sample_rate)],)
+        [PitchShift(n_samples=num_samples, sample_rate=sample_rate)],
+    )

     t_audio = transform(audio)
-    # torchaudio.save("tests/pitch.wav", t_audio, sample_rate=sample_rate)
+    # torchaudio.save("tests/pitch.wav", audio, sample_rate=sample_rate)
+    # torchaudio.save("tests/t_pitch.wav", t_audio, sample_rate=sample_rate)
     assert t_audio.shape == audio.shape


+def test_pitch_shift_fast_ratios():
+    ps = PitchShift(
+        n_samples=num_samples,
+        sample_rate=sample_rate,
+        pitch_shift_min=-5,
+        pitch_shift_max=5,
+    )
+    assert len(ps.fast_shifts) == 20
+
+
+def test_pitch_shift_no_fast_ratios():
+    with pytest.raises(ValueError):
+        ps = PitchShift(
+            n_samples=num_samples,
+            sample_rate=sample_rate,
+            pitch_shift_min=4,
+            pitch_shift_max=4,
+        )
+
+
 @pytest.mark.parametrize("num_channels", [1, 2])
 def test_reverb(num_channels):
     audio = generate_waveform(sample_rate, num_samples, num_channels)
-    transform = Compose([Reverb(sample_rate=sample_rate)],)
+    transform = Compose(
+        [Reverb(sample_rate=sample_rate)],
+    )

     t_audio = transform(audio)
     # torchaudio.save("tests/reverb.wav", t_audio, sample_rate=sample_rate)

@@ -107,7 +165,9 @@ def test_reverb(num_channels):
 @pytest.mark.parametrize("num_channels", [1, 2])
 def test_reverse(num_channels):
     stereo_audio = generate_waveform(sample_rate, num_samples, num_channels)
-    transform = Compose([Reverse()],)
+    transform = Compose(
+        [Reverse()],
+    )

     t_audio = transform(stereo_audio)
     # torchaudio.save("tests/reverse.wav", t_audio, sample_rate=sample_rate)
torchaudio_augmentations/augmentations/pitch_shift.py (+53 -30)

@@ -1,40 +1,63 @@
 import random
 import torch
-import augment
+from torchaudio_augmentations.utils import (
+    add_audio_batch_dimension,
+    remove_audio_batch_dimension,
+    tensor_has_valid_audio_batch_dimension,
+)
+from fractions import Fraction
+from typing import Optional
+from torch_pitch_shift import get_fast_shifts, pitch_shift, semitones_to_ratio


 class PitchShift:
     def __init__(
-        self, n_samples, sample_rate, pitch_cents_min=-700, pitch_cents_max=700
+        self,
+        n_samples,
+        sample_rate,
+        pitch_shift_min: int = -7.0,
+        pitch_shift_max: int = 7.0,
+        bins_per_octave: Optional[int] = 12,
     ):
         self.n_samples = n_samples
         self.sample_rate = sample_rate
-        self.pitch_cents_min = pitch_cents_min
-        self.pitch_cents_max = pitch_cents_max
-        self.src_info = {"rate": self.sample_rate}
-
-    def __call__(self, audio):
-        n_steps = random.randint(self.pitch_cents_min, self.pitch_cents_max)
-        effect_chain = augment.EffectChain().pitch(n_steps).rate(self.sample_rate)
-
-        num_channels = audio.shape[0]
-        target_info = {
-            "channels": num_channels,
-            "length": self.n_samples,
-            "rate": self.sample_rate,
-        }
-        y = effect_chain.apply(audio, src_info=self.src_info, target_info=target_info)
-
-        # sox might misbehave sometimes by giving nan/inf if sequences are too short (or silent)
-        # and the effect chain includes eg `pitch`
-        if torch.isnan(y).any() or torch.isinf(y).any():
-            return audio.clone()
-
-        if y.shape[1] != audio.shape[1]:
-            if y.shape[1] > audio.shape[1]:
-                y = y[:, : audio.shape[1]]
-            else:
-                y0 = torch.zeros(1, audio.shape[1]).to(y.device)
-                y0[:, : y.shape[1]] = y
-                y = y0
+        self.pitch_shift_min = pitch_shift_min
+        self.pitch_shift_max = pitch_shift_max
+        self.bins_per_octave = bins_per_octave
+
+        self._fast_shifts = get_fast_shifts(
+            sample_rate,
+            lambda x: x >= semitones_to_ratio(self.pitch_shift_min)
+            and x <= semitones_to_ratio(self.pitch_shift_max)
+            and x != 1,
+        )
+
+        if len(self._fast_shifts) == 0:
+            raise ValueError(
+                f"Could not compute any fast pitch-shift ratios for the given sample rate and pitch shift range: {self.pitch_shift_min} - {self.pitch_shift_max} (semitones)"
+            )
+
+    @property
+    def fast_shifts(self):
+        return self._fast_shifts
+
+    def draw_sample_uniform_from_fast_shifts(self) -> Fraction:
+        return random.choice(self.fast_shifts)
+
+    def __call__(self, audio: torch.Tensor) -> torch.Tensor:
+        is_batched = False
+        if not tensor_has_valid_audio_batch_dimension(audio):
+            audio = add_audio_batch_dimension(audio)
+            is_batched = True
+
+        fast_shift = self.draw_sample_uniform_from_fast_shifts()
+        y = pitch_shift(
+            input=audio,
+            shift=fast_shift,
+            sample_rate=self.sample_rate,
+            bins_per_octave=self.bins_per_octave,
+        )
+
+        if is_batched:
+            y = remove_audio_batch_dimension(y)
         return y
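A minimal usage sketch of the rewritten class (not part of the commit; the top-level import path is assumed):

import torch
from torchaudio_augmentations import PitchShift  # import path assumed

sample_rate = 22050
num_samples = sample_rate * 5

transform = PitchShift(
    n_samples=num_samples,
    sample_rate=sample_rate,
    pitch_shift_min=-7,
    pitch_shift_max=7,
)

# Unbatched [channels, samples] input: __call__ adds a batch dimension,
# runs torch_pitch_shift.pitch_shift, then removes it again.
audio = torch.rand(1, num_samples)
assert transform(audio).shape == audio.shape

# Batched [batch, channels, samples] input is processed as-is.
batch = torch.rand(8, 1, num_samples)
assert transform(batch).shape == batch.shape

One quirk when reading __call__: the is_batched flag is set to True when a batch dimension had to be added, i.e. when the input was not batched; the round trip is correct either way.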

torchaudio_augmentations/augmentations/random_resized_crop.py (+2 -2)

@@ -8,7 +8,7 @@ def __init__(self, n_samples):
         self.n_samples = n_samples

     def forward(self, audio):
-        max_samples = audio.shape[1]
+        max_samples = audio.shape[-1]
         start_idx = random.randint(0, max_samples - self.n_samples)
-        audio = audio[:, start_idx : start_idx + self.n_samples]
+        audio = audio[..., start_idx : start_idx + self.n_samples]
         return audio

torchaudio_augmentations/augmentations/reverse.py (+1 -1)

@@ -7,4 +7,4 @@ def __init__(self):
         super().__init__()

     def forward(self, audio):
-        return torch.flip(audio, dims=[1])
+        return torch.flip(audio, dims=[-1])
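The random_resized_crop.py and reverse.py changes are the same idea: index from the last dimension instead of dimension 1, so both ops accept unbatched [channels, samples] and batched [batch, channels, samples] tensors. A quick illustration (not from the repo):

import torch

mono = torch.arange(10.0).reshape(1, 10)  # [channels, samples]
batch = mono.repeat(4, 1, 1)              # [batch, channels, samples]

# audio[..., start:end] crops the time axis regardless of rank:
assert mono[..., 2:7].shape == (1, 5)
assert batch[..., 2:7].shape == (4, 1, 5)

# torch.flip(audio, dims=[-1]) reverses the time axis regardless of rank:
assert torch.flip(mono, dims=[-1])[0, 0] == 9.0
assert torch.flip(batch, dims=[-1])[0, 0, 0] == 9.0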

torchaudio_augmentations/utils.py (+15 -0)

@@ -0,0 +1,15 @@
+import torch
+
+
+def tensor_has_valid_audio_batch_dimension(tensor: torch.Tensor) -> torch.Tensor:
+    if tensor.ndim == 3:
+        return True
+    return False
+
+
+def add_audio_batch_dimension(tensor: torch.Tensor) -> torch.Tensor:
+    return tensor.unsqueeze(dim=0)
+
+
+def remove_audio_batch_dimension(tensor: torch.Tensor) -> torch.Tensor:
+    return tensor.squeeze(dim=0)
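These helpers are what the new PitchShift.__call__ uses to normalize its input. A round-trip sketch, assuming the three functions are imported from torchaudio_augmentations.utils:

import torch
from torchaudio_augmentations.utils import (
    add_audio_batch_dimension,
    remove_audio_batch_dimension,
    tensor_has_valid_audio_batch_dimension,
)

audio = torch.rand(2, 22050)                      # [channels, samples]: no batch dim
assert not tensor_has_valid_audio_batch_dimension(audio)

batched = add_audio_batch_dimension(audio)        # [1, channels, samples]
assert tensor_has_valid_audio_batch_dimension(batched)

restored = remove_audio_batch_dimension(batched)  # back to [channels, samples]
assert torch.equal(restored, audio)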
