Skip to content

Add simulate_rir_ism method for simulating RIR with Image Source Method #2644

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .circleci/unittest/linux/scripts/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ fi
(
set -x
conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} 'librosa>=0.8.0' parameterized 'requests>=2.20'
pip install kaldi-io SoundFile coverage pytest pytest-cov 'scipy==1.7.3' transformers expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag
pip install kaldi-io SoundFile coverage pytest pytest-cov 'scipy==1.7.3' transformers expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics
)
# Install fairseq
git clone https://github.com/pytorch/fairseq
Expand Down
3 changes: 2 additions & 1 deletion .circleci/unittest/windows/scripts/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ esac
unidecode \
'protobuf<4.21.0' \
demucs \
tinytag
tinytag \
pyroomacoustics
)
# Install fairseq
git clone https://github.com/pytorch/fairseq
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ endif()
# Options
option(BUILD_SOX "Build libsox statically" ON)
option(BUILD_KALDI "Build kaldi statically" ON)
option(BUILD_RIR "Enable RIR simulation" ON)
option(BUILD_RNNT "Enable RNN transducer" ON)
option(BUILD_CTC_DECODER "Build Flashlight CTC decoder" ON)
option(BUILD_TORCHAUDIO_PYTHON_EXTENSION "Build Python extension" OFF)
Expand Down
5 changes: 5 additions & 0 deletions docs/source/prototype.functional.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,8 @@ fftconvolve
~~~~~~~~~~~

.. autofunction:: fftconvolve

simulate_rir_ism
~~~~~~~~~~~~~~~~

.. autofunction:: simulate_rir_ism
8 changes: 8 additions & 0 deletions docs/source/refs.bib
Original file line number Diff line number Diff line change
Expand Up @@ -423,3 +423,11 @@ @article{CATTONI2021101155
keywords = {Spoken language translation, Multilingual corpus},
abstract = {End-to-end spoken language translation (SLT) has recently gained popularity thanks to the advancement of sequence to sequence learning in its two parent tasks: automatic speech recognition (ASR) and machine translation (MT). However, research in the field has to confront with the scarcity of publicly available corpora to train data-hungry neural networks. Indeed, while traditional cascade solutions can build on sizable ASR and MT training data for a variety of languages, the available SLT corpora suitable for end-to-end training are few, typically small and of limited language coverage. We contribute to fill this gap by presenting MuST-C, a large and freely available Multilingual Speech Translation Corpus built from English TED Talks. Its unique features include: i) language coverage and diversity (from English into 14 languages from different families), ii) size (at least 237 hours of transcribed recordings per language, 430 on average), iii) variety of topics and speakers, and iv) data quality. Besides describing the corpus creation methodology and discussing the outcomes of empirical and manual quality evaluations, we present baseline results computed with strong systems on each language direction covered by MuST-C.}
}
@inproceedings{scheibler2018pyroomacoustics,
title={Pyroomacoustics: A Python package for audio room simulation and array processing algorithms},
author={Scheibler, Robin and Bezzam, Eric and Dokmani{\'c}, Ivan},
booktitle={2018 IEEE international conference on acoustics, speech and signal processing (ICASSP)},
pages={351--355},
year={2018},
organization={IEEE}
}
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import torch
from torchaudio_unittest.common_utils import PytorchTestCase

from .autograd_test_impl import AutogradTestImpl
from .autograd_test_impl import AutogradTestImpl, AutogradTestRIRImpl


class TestAutogradCPUFloat64(AutogradTestImpl, PytorchTestCase):
    """CPU / float64 instantiation of the shared autograd test suite."""

    # float64 gives gradcheck enough numerical precision on CPU.
    dtype = torch.float64
    device = torch.device("cpu")


class TestAutogradRIRCPUFloat64(AutogradTestRIRImpl, PytorchTestCase):
    """CPU / float64 instantiation of the RIR-simulation autograd tests."""

    # float64 gives gradcheck enough numerical precision on CPU.
    dtype = torch.float64
    device = torch.device("cpu")
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,24 @@ def test_add_noise(self):

self.assertTrue(gradcheck(F.add_noise, (waveform, noise, lengths, snr)))
self.assertTrue(gradgradcheck(F.add_noise, (waveform, noise, lengths, snr)))


class AutogradTestRIRImpl(TestBaseMixin):
    """Autograd smoke tests for ``F.simulate_rir_ism``.

    Verifies first- and second-order gradients via ``gradcheck`` /
    ``gradgradcheck`` for both 2-D and 3-D rooms.
    """

    @parameterized.expand([(2, 1), (3, 4)])
    def test_simulate_rir_ism(self, D, channel):
        """Gradients of simulate_rir_ism pass torch's numerical checks."""
        tensor_kwargs = {"dtype": self.dtype, "device": self.device, "requires_grad": True}
        # Draw tensors in this exact order so the RNG consumption matches
        # a straight-line version of this test.
        room = torch.rand(D, **tensor_kwargs)
        mic_array = torch.rand(channel, D, **tensor_kwargs)
        source = torch.rand(D, **tensor_kwargs)
        args = (room, source, mic_array, 2, 0.5, 1000)  # max_order=2, e_absorption=0.5, output_length=1000
        # Loose tolerances: the RIR is assembled from many delayed impulses,
        # which makes the numerical Jacobian noisy.
        for checker in (gradcheck, gradgradcheck):
            self.assertTrue(checker(F.simulate_rir_ism, args, atol=1e-3, rtol=1))
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import torch
from torchaudio_unittest.common_utils import PytorchTestCase

from .functional_test_impl import FunctionalTestImpl
from .functional_test_impl import FunctionalTestImpl, FunctionalTestRIRImpl


class FunctionalFloat32CPUTest(FunctionalTestImpl, PytorchTestCase):
Expand All @@ -12,3 +12,13 @@ class FunctionalFloat32CPUTest(FunctionalTestImpl, PytorchTestCase):
class FunctionalFloat64CPUTest(FunctionalTestImpl, PytorchTestCase):
    """CPU / float64 instantiation of the shared functional test suite."""

    dtype = torch.float64
    device = torch.device("cpu")


class FunctionalRIRFloat32Test(FunctionalTestRIRImpl, PytorchTestCase):
    """CPU / float32 instantiation of the RIR-simulation functional tests."""

    dtype = torch.float32
    device = torch.device("cpu")


class FunctionalRIRFloat64Test(FunctionalTestRIRImpl, PytorchTestCase):
    """CPU / float64 instantiation of the RIR-simulation functional tests."""

    dtype = torch.float64
    device = torch.device("cpu")
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import numpy as np

try:
import pyroomacoustics as pra
Copy link
Collaborator

@mthrok mthrok Oct 4, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please follow

if _mod_utils.is_module_available("soundfile"):
import soundfile

except Exception:
pass
import torch
import torchaudio.prototype.functional as F
from parameterized import parameterized
from scipy import signal
from torchaudio_unittest.common_utils import nested_params, TestBaseMixin
from torchaudio_unittest.common_utils import nested_params, skipIfNoModule, TestBaseMixin


class FunctionalTestImpl(TestBaseMixin):
Expand Down Expand Up @@ -107,3 +112,79 @@ def test_add_noise_length_check(self):

with self.assertRaisesRegex(ValueError, "Length dimensions"):
F.add_noise(waveform, noise, lengths, snr)


class FunctionalTestRIRImpl(TestBaseMixin):
    """Tests ``F.simulate_rir_ism`` against a pyroomacoustics reference implementation.

    Both tests build the same shoebox room in pyroomacoustics, compute the
    reference RIR, and compare it with torchaudio's output. The single-band
    variant uses one absorption coefficient for every wall; the multi-band
    variant uses a per-wall, 7-band coefficient table.
    """

    def _make_inputs(self, D, channel):
        """Draw random room dimensions, mic positions and source position.

        Offsets keep the source and microphones strictly inside the room
        (room sides are in [5, 6), mics in [1, 2), source in [4, 5)).
        Tensors are drawn in this order so RNG consumption is deterministic.
        """
        room_dim = torch.rand(D, dtype=self.dtype, device=self.device) + 5
        mic_array = torch.rand(channel, D, dtype=self.dtype, device=self.device) + 1
        source = torch.rand(D, dtype=self.dtype, device=self.device) + 4
        return room_dim, mic_array, source

    def _pra_expected_rir(self, room_dim, mic_array, source, max_order, materials):
        """Compute the reference RIR with pyroomacoustics.

        Returns a (channel, max_len) tensor; per-channel RIRs of different
        lengths are right-padded with zeros to the longest one.
        """
        room = pra.ShoeBox(
            # .cpu() so the test also works when self.device is not CPU.
            room_dim.detach().cpu().numpy(),
            fs=16000,
            materials=materials,
            max_order=max_order,
            ray_tracing=False,
            air_absorption=False,
        )
        # pyroomacoustics expects microphone positions with shape (D, channel).
        room.add_microphone_array(mic_array.detach().cpu().numpy().T)
        room.add_source(source.tolist())
        room.compute_rir()
        channel = mic_array.shape[0]
        max_len = max(room.rir[i][0].shape[0] for i in range(channel))
        expected = torch.zeros(channel, max_len, dtype=self.dtype, device=self.device)
        for i in range(channel):
            rir_i = torch.from_numpy(room.rir[i][0])
            expected[i, : rir_i.shape[0]] = rir_i
        return expected

    @skipIfNoModule("pyroomacoustics")
    @parameterized.expand([(2, 1), (3, 4)])
    def test_simulate_rir_ism_single_band(self, D, channel):
        """Test simulate_rir_ism in the case where absorption coefficients are identical for all walls."""
        room_dim, mic_array, source = self._make_inputs(D, channel)
        max_order = 3
        e_absorption = 0.5
        expected = self._pra_expected_rir(room_dim, mic_array, source, max_order, pra.Material(e_absorption))
        actual = F.simulate_rir_ism(room_dim, source, mic_array, max_order, e_absorption)
        # Loose tolerances: minor numerical differences are expected between
        # the two implementations.
        self.assertEqual(expected, actual, atol=1e-3, rtol=2)

    @skipIfNoModule("pyroomacoustics")
    @parameterized.expand([(2, 1), (3, 4)])
    def test_simulate_rir_ism_multi_band(self, D, channel):
        """Test simulate_rir_ism in the case where absorption coefficients are different for all walls."""
        room_dim, mic_array, source = self._make_inputs(D, channel)
        max_order = 3
        walls = ["west", "east", "south", "north"]
        if D == 3:
            walls += ["floor", "ceiling"]
        # One column of 7 band coefficients per wall (4 walls in 2-D, 6 in 3-D).
        e_absorption = torch.rand(7, len(walls), dtype=self.dtype, device=self.device)
        center_freqs = [125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0]
        materials = {
            wall: pra.Material(
                {
                    "coeffs": e_absorption[:, i].detach().cpu().numpy(),
                    "center_freqs": center_freqs,
                }
            )
            for i, wall in enumerate(walls)
        }
        expected = self._pra_expected_rir(room_dim, mic_array, source, max_order, materials)
        actual = F.simulate_rir_ism(room_dim, source, mic_array, max_order, e_absorption)
        self.assertEqual(expected, actual, atol=1e-3, rtol=2)
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import torch
from torchaudio_unittest.common_utils import PytorchTestCase

from .torchscript_consistency_test_impl import TorchScriptConsistencyTestImpl
from .torchscript_consistency_test_impl import TorchScriptConsistencyTestImpl, TorchScriptConsistencyTestRIRImpl


class TorchScriptConsistencyCPUFloat32Test(TorchScriptConsistencyTestImpl, PytorchTestCase):
Expand All @@ -12,3 +12,13 @@ class TorchScriptConsistencyCPUFloat32Test(TorchScriptConsistencyTestImpl, Pytor
class TorchScriptConsistencyCPUFloat64Test(TorchScriptConsistencyTestImpl, PytorchTestCase):
    """CPU / float64 instantiation of the TorchScript consistency tests."""

    dtype = torch.float64
    device = torch.device("cpu")


class TorchScriptConsistencyRIRCPUFloat32Test(TorchScriptConsistencyTestRIRImpl, PytorchTestCase):
    """CPU / float32 instantiation of the RIR TorchScript consistency tests."""

    dtype = torch.float32
    device = torch.device("cpu")


class TorchScriptConsistencyRIRCPUFloat64Test(TorchScriptConsistencyTestRIRImpl, PytorchTestCase):
    """CPU / float64 instantiation of the RIR TorchScript consistency tests."""

    dtype = torch.float64
    device = torch.device("cpu")
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,53 @@ def test_add_noise(self):
snr = torch.rand(*leading_dims, dtype=self.dtype, device=self.device, requires_grad=True) * 10

self._assert_consistency(F.add_noise, (waveform, noise, lengths, snr))


class TorchScriptConsistencyTestRIRImpl(TestBaseMixin):
    """Checks that ``F.simulate_rir_ism`` produces the same result when run
    eagerly and when compiled with TorchScript."""

    def _assert_consistency(self, func, inputs, shape_only=False):
        """Run ``func`` eagerly and as TorchScript on ``inputs`` and compare.

        Tensor inputs are moved to the test's device/dtype; non-tensor
        arguments are passed through unchanged. When ``shape_only`` is True,
        only output shapes are compared instead of values.
        """
        inputs_ = []
        for i in inputs:
            if torch.is_tensor(i):
                i = i.to(device=self.device, dtype=self.dtype)
            inputs_.append(i)
        ts_func = torch_script(func)

        # Re-seed before each call so any internal randomness is identical
        # across the eager and scripted runs.
        torch.random.manual_seed(40)
        output = func(*inputs_)

        torch.random.manual_seed(40)
        ts_output = ts_func(*inputs_)

        if shape_only:
            # NOTE(review): source indentation was lost in extraction; the
            # assertEqual below is reconstructed at method level so it runs
            # in both branches — confirm against the original file.
            ts_output = ts_output.shape
            output = output.shape
        self.assertEqual(ts_output, output)

    @parameterized.expand([(2, 1), (3, 4)])
    def test_simulate_rir_ism_single_band(self, D, channel):
        # Single absorption coefficient shared by all walls.
        room_dim = torch.rand(D, dtype=self.dtype, device=self.device) + 5
        mic_array = torch.rand(channel, D, dtype=self.dtype, device=self.device) + 1
        source = torch.rand(D, dtype=self.dtype, device=self.device) + 4
        max_order = 3
        e_absorption = 0.5
        center_frequency = torch.tensor([125, 250, 500, 1000, 2000, 4000, 8000], dtype=self.dtype, device=self.device)
        # Remaining positional args: output_length=None, delay_filter_length=81,
        # center_frequency, sound_speed=343.0, sample_rate=16000.0
        # — presumably matching simulate_rir_ism's signature; verify there.
        self._assert_consistency(
            F.simulate_rir_ism,
            (room_dim, source, mic_array, max_order, e_absorption, None, 81, center_frequency, 343.0, 16000.0),
        )

    @parameterized.expand([(2, 1), (3, 4)])
    def test_simulate_rir_ism_multi_band(self, D, channel):
        # Per-wall, 7-band absorption table: 4 walls in 2-D, 6 in 3-D.
        room_dim = torch.rand(D, dtype=self.dtype, device=self.device) + 5
        mic_array = torch.rand(channel, D, dtype=self.dtype, device=self.device) + 1
        source = torch.rand(D, dtype=self.dtype, device=self.device) + 4
        max_order = 3
        if D == 2:
            e_absorption = torch.rand(7, 4, dtype=self.dtype, device=self.device)
        else:
            e_absorption = torch.rand(7, 6, dtype=self.dtype, device=self.device)
        center_frequency = torch.tensor([125, 250, 500, 1000, 2000, 4000, 8000], dtype=self.dtype, device=self.device)
        self._assert_consistency(
            F.simulate_rir_ism,
            (room_dim, source, mic_array, max_order, e_absorption, None, 81, center_frequency, 343.0, 16000.0),
        )
8 changes: 8 additions & 0 deletions torchaudio/csrc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,14 @@ if(BUILD_RNNT)
endif()
endif()

if(BUILD_RIR)
list(
APPEND
LIBTORCHAUDIO_SOURCES
build_rir.cpp
)
endif()

if(USE_CUDA)
list(
APPEND
Expand Down
Loading