pytorch · NicolasHug · Nov 10, 2022 · Nov 11, 2022 · Nov 12, 2022 · Nov 12, 2022
@@ -72,7 +72,7 @@ fi
 (
     set -x
     conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} 'librosa>=0.8.0' parameterized 'requests>=2.20'
-    pip install kaldi-io SoundFile coverage pytest pytest-cov 'scipy==1.7.3' transformers expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag
+    pip install kaldi-io SoundFile coverage pytest pytest-cov 'scipy==1.7.3' transformers expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics
 )
 # Install fairseq
 git clone https://github.com/pytorch/fairseq

@@ -90,7 +90,8 @@ esac
         unidecode \
         'protobuf<4.21.0' \
         demucs \
-        tinytag
+        tinytag \
+        pyroomacoustics
 )
 # Install fairseq
 git clone https://github.com/pytorch/fairseq

@@ -58,6 +58,7 @@ endif()
 # Options
 option(BUILD_SOX "Build libsox statically" ON)
 option(BUILD_KALDI "Build kaldi statically" ON)
+option(BUILD_RAY_TRACING "Enable ray tracing simulation" ON)  # TODO: REMOVE THIS
 option(BUILD_RNNT "Enable RNN transducer" ON)
 option(BUILD_CTC_DECODER "Build Flashlight CTC decoder" ON)
 option(BUILD_TORCHAUDIO_PYTHON_EXTENSION "Build Python extension" OFF)

@@ -50,3 +50,8 @@ DSP
    extend_pitch
    oscillator_bank
    sinc_impulse_response
+
+ray_tracing
+~~~~~~~~~~~
+
+.. autofunction:: ray_tracing
@@ -464,6 +464,14 @@ @inproceedings{GigaSpeech2021
   year=2021,
   author={Guoguo Chen and Shuzhou Chai and Guanbo Wang and Jiayu Du and Wei-Qiang Zhang and Chao Weng and Dan Su and Daniel Povey and Jan Trmal and Junbo Zhang and Mingjie Jin and Sanjeev Khudanpur and Shinji Watanabe and Shuaijiang Zhao and Wei Zou and Xiangang Li and Xuchen Yao and Yongqing Wang and Yujun Wang and Zhao You and Zhiyong Yan}
 }
+@inproceedings{scheibler2018pyroomacoustics,
+  title={Pyroomacoustics: A python package for audio room simulation and array processing algorithms},
+  author={Scheibler, Robin and Bezzam, Eric and Dokmani{\'c}, Ivan},
+  booktitle={2018 IEEE international conference on acoustics, speech and signal processing (ICASSP)},
+  pages={351--355},
+  year={2018},
+  organization={IEEE}
+}
 @inproceedings{ko15_interspeech,
   author={Tom Ko and Vijayaditya Peddinti and Daniel Povey and Sanjeev Khudanpur},
   title={{Audio augmentation for speech recognition}},

@@ -1,7 +1,7 @@
 import torch
 from torchaudio_unittest.common_utils import PytorchTestCase
 
-from .functional_test_impl import Functional64OnlyTestImpl, FunctionalTestImpl
+from .functional_test_impl import Functional64OnlyTestImpl, FunctionalCPUOnlyTestImpl, FunctionalTestImpl
 
 
 class FunctionalFloat32CPUTest(FunctionalTestImpl, PytorchTestCase):
@@ -17,3 +17,13 @@ class FunctionalFloat64CPUTest(FunctionalTestImpl, PytorchTestCase):
 class FunctionalFloat64OnlyCPUTest(Functional64OnlyTestImpl, PytorchTestCase):
     dtype = torch.float64
     device = torch.device("cpu")
+
+
+class FunctionalFloat32CPUOnlyTest(FunctionalCPUOnlyTestImpl, PytorchTestCase):
+    # Binds the CPU-only functional tests to float32 on the CPU device.
+    dtype = torch.float32
+    device = torch.device("cpu")
+
+
+class FunctionalFloat64CPUOnlyTest(FunctionalCPUOnlyTestImpl, PytorchTestCase):
+    # Binds the CPU-only functional tests to float64 on the CPU device.
+    dtype = torch.float64
+    device = torch.device("cpu")
@@ -5,11 +5,15 @@
 import torchaudio.prototype.functional as F
 from parameterized import param, parameterized
 from scipy import signal
+from torchaudio._internal import module_utils as _mod_utils
 from torchaudio.functional import lfilter
-from torchaudio_unittest.common_utils import nested_params, TestBaseMixin
+from torchaudio_unittest.common_utils import nested_params, skipIfNoModule, TestBaseMixin
 
 from .dsp_utils import oscillator_bank as oscillator_bank_np, sinc_ir as sinc_ir_np
 
+if _mod_utils.is_module_available("pyroomacoustics"):
+    import pyroomacoustics as pra
+
 
 def _prod(l):
     r = 1
@@ -518,3 +522,306 @@ def _debug_plot():
         except AssertionError:
             _debug_plot()
             raise
+
+
+class FunctionalCPUOnlyTestImpl(TestBaseMixin):
+    @parameterized.expand(
+        [
+            (0.1, 0.2, (2, 1, 2500)),  # both float
+            # Per-wall
+            (torch.rand(4), 0.2, (2, 1, 2500)),
+            (0.1, torch.rand(4), (2, 1, 2500)),
+            (torch.rand(4), torch.rand(4), (2, 1, 2500)),
+            # Per-band and per-wall
+            (torch.rand(6, 4), 0.2, (2, 6, 2500)),
+            (0.1, torch.rand(6, 4), (2, 6, 2500)),
+            (torch.rand(6, 4), torch.rand(6, 4), (2, 6, 2500)),
+        ]
+    )
+    def test_ray_tracing_output_shape(self, absorption, scattering, expected_shape):
+        """Check the shape of the histogram returned by ``F.ray_tracing``.
+
+        Each case passes ``absorption`` / ``scattering`` as a float, a ``(4,)``
+        tensor or a ``(6, 4)`` tensor. The second output dimension is expected to
+        be 6 whenever either argument is a ``(6, 4)`` tensor, and 1 otherwise.
+
+        NOTE(review): the parameter tensors are created without an explicit
+        dtype, so they do not follow ``self.dtype``.
+        NOTE(review): the trailing 2500 presumably comes from the default
+        ``time_thres`` / ``hist_bin_size`` of ``F.ray_tracing`` -- confirm.
+        """
+        # 2D room with two microphones, hence the leading 2 in every expected shape.
+        room_dim = torch.tensor([20, 25], dtype=self.dtype)
+        mic_array = torch.tensor([[2, 2], [8, 8]], dtype=self.dtype)
+        source = torch.tensor([7, 6], dtype=self.dtype)
+        num_rays = 100
+
+        hist = F.ray_tracing(
+            room=room_dim,
+            source=source,
+            mic_array=mic_array,
+            num_rays=num_rays,
+            absorption=absorption,
+            scattering=scattering,
+        )
+
+        assert hist.shape == expected_shape
+
+    def test_ray_tracing_input_errors(self):
+        """Check that invalid arguments to ``F.ray_tracing`` raise ``ValueError``.
+
+        Each ``assertRaisesRegex`` block feeds one malformed input and matches
+        the start of the expected message. The calls at the end use valid
+        shape combinations and are expected to complete without raising.
+        """
+        # room given as a 2D tensor
+        with self.assertRaisesRegex(ValueError, "room must be a 1D tensor"):
+            F.ray_tracing(
+                room=torch.tensor([[4, 5]]), source=torch.tensor([0, 0]), mic_array=torch.tensor([[3, 4]]), num_rays=10
+            )
+        # room is 1D but has four entries
+        with self.assertRaisesRegex(ValueError, "room must be a 1D tensor"):
+            F.ray_tracing(
+                room=torch.tensor([4, 5, 4, 5]),
+                source=torch.tensor([0, 0]),
+                mic_array=torch.tensor([[3, 4]]),
+                num_rays=10,
+            )
+        # mic_array given as a 3D tensor
+        with self.assertRaisesRegex(ValueError, r"mic_array must be 1D tensor of shape \(D,\), or 2D tensor"):
+            F.ray_tracing(
+                room=torch.tensor([4, 5]), source=torch.tensor([0, 0]), mic_array=torch.tensor([[[3, 4]]]), num_rays=10
+            )
+        # integer room
+        with self.assertRaisesRegex(ValueError, "room must be of float32 or float64 dtype"):
+            F.ray_tracing(
+                room=torch.tensor([4, 5]).to(torch.int),
+                source=torch.tensor([0, 0]),
+                mic_array=torch.tensor([3, 4]),
+                num_rays=10,
+            )
+        # float64 room combined with a float32 source
+        with self.assertRaisesRegex(ValueError, "dtype of room, source and mic_array must be the same"):
+            F.ray_tracing(
+                room=torch.tensor([4, 5]).to(torch.float64),
+                source=torch.tensor([0, 0]).to(torch.float32),
+                mic_array=torch.tensor([3, 4]),
+                num_rays=10,
+            )
+        # 3-element room with 2-element source and mic_array
+        with self.assertRaisesRegex(ValueError, "Room dimension D must match with source and mic_array"):
+            F.ray_tracing(
+                room=torch.tensor([4, 5, 10], dtype=torch.float),
+                source=torch.tensor([0, 0], dtype=torch.float),
+                mic_array=torch.tensor([3, 4], dtype=torch.float),
+                num_rays=10,
+            )
+        # 2-element room with a 3-element source
+        with self.assertRaisesRegex(ValueError, "Room dimension D must match with source and mic_array"):
+            F.ray_tracing(
+                room=torch.tensor([4, 5], dtype=torch.float),
+                source=torch.tensor([0, 0, 0], dtype=torch.float),
+                mic_array=torch.tensor([3, 4], dtype=torch.float),
+                num_rays=10,
+            )
+        # 3-element room and source with a 2-element mic_array
+        with self.assertRaisesRegex(ValueError, "Room dimension D must match with source and mic_array"):
+            F.ray_tracing(
+                room=torch.tensor([4, 5, 10], dtype=torch.float),
+                source=torch.tensor([0, 0, 0], dtype=torch.float),
+                mic_array=torch.tensor([3, 4], dtype=torch.float),
+                num_rays=10,
+            )
+        # time_thres smaller than hist_bin_size
+        with self.assertRaisesRegex(ValueError, "time_thres=10 must be at least greater than hist_bin_size=11"):
+            F.ray_tracing(
+                room=torch.tensor([4, 5], dtype=torch.float),
+                source=torch.tensor([0, 0], dtype=torch.float),
+                mic_array=torch.tensor([3, 4], dtype=torch.float),
+                num_rays=10,
+                time_thres=10,
+                hist_bin_size=11,
+            )
+        # absorption of shape (5,) for a 2-element room
+        with self.assertRaisesRegex(ValueError, "The shape of absorption must be"):
+            F.ray_tracing(
+                room=torch.tensor([4, 5], dtype=torch.float),
+                source=torch.tensor([0, 0], dtype=torch.float),
+                mic_array=torch.tensor([3, 4], dtype=torch.float),
+                num_rays=10,
+                absorption=torch.rand(5, dtype=torch.float),
+            )
+        # scattering of shape (5, 5)
+        with self.assertRaisesRegex(ValueError, "The shape of scattering must be"):
+            F.ray_tracing(
+                room=torch.tensor([4, 5], dtype=torch.float),
+                source=torch.tensor([0, 0], dtype=torch.float),
+                mic_array=torch.tensor([3, 4], dtype=torch.float),
+                num_rays=10,
+                scattering=torch.rand(5, 5, dtype=torch.float),
+            )
+        # absorption of shape (5, 5)
+        with self.assertRaisesRegex(ValueError, "The shape of absorption must be"):
+            F.ray_tracing(
+                room=torch.tensor([4, 5], dtype=torch.float),
+                source=torch.tensor([0, 0], dtype=torch.float),
+                mic_array=torch.tensor([3, 4], dtype=torch.float),
+                num_rays=10,
+                absorption=torch.rand(5, 5, dtype=torch.float),
+            )
+        # scattering of shape (5,)
+        with self.assertRaisesRegex(ValueError, "The shape of scattering must be"):
+            F.ray_tracing(
+                room=torch.tensor([4, 5], dtype=torch.float),
+                source=torch.tensor([0, 0], dtype=torch.float),
+                mic_array=torch.tensor([3, 4], dtype=torch.float),
+                num_rays=10,
+                scattering=torch.rand(5, dtype=torch.float),
+            )
+        # absorption with 6 rows against scattering with 5 rows
+        with self.assertRaisesRegex(
+            ValueError, "absorption and scattering must have the same number of bands and walls"
+        ):
+            F.ray_tracing(
+                room=torch.tensor([4, 5], dtype=torch.float),
+                source=torch.tensor([0, 0], dtype=torch.float),
+                mic_array=torch.tensor([3, 4], dtype=torch.float),
+                num_rays=10,
+                absorption=torch.rand(6, 4, dtype=torch.float),
+                scattering=torch.rand(5, 4, dtype=torch.float),
+            )
+
+        # Make sure passing different shapes for absorption or scattering doesn't raise an error
+        # float and tensor
+        F.ray_tracing(
+            room=torch.tensor([4, 5], dtype=torch.float),
+            source=torch.tensor([0, 0], dtype=torch.float),
+            mic_array=torch.tensor([3, 4], dtype=torch.float),
+            num_rays=10,
+            absorption=0.1,
+            scattering=torch.rand(5, 4, dtype=torch.float),
+        )
+        F.ray_tracing(
+            room=torch.tensor([4, 5], dtype=torch.float),
+            source=torch.tensor([0, 0], dtype=torch.float),
+            mic_array=torch.tensor([3, 4], dtype=torch.float),
+            num_rays=10,
+            absorption=torch.rand(5, 4, dtype=torch.float),
+            scattering=0.1,
+        )
+        # per-wall only and per-band + per-wall
+        F.ray_tracing(
+            room=torch.tensor([4, 5], dtype=torch.float),
+            source=torch.tensor([0, 0], dtype=torch.float),
+            mic_array=torch.tensor([3, 4], dtype=torch.float),
+            num_rays=10,
+            absorption=torch.rand(4, dtype=torch.float),
+            scattering=torch.rand(6, 4, dtype=torch.float),
+        )
+        F.ray_tracing(
+            room=torch.tensor([4, 5], dtype=torch.float),
+            source=torch.tensor([0, 0], dtype=torch.float),
+            mic_array=torch.tensor([3, 4], dtype=torch.float),
+            num_rays=10,
+            absorption=torch.rand(6, 4, dtype=torch.float),
+            scattering=torch.rand(4, dtype=torch.float),
+        )
+
+    def test_ray_tracing_per_band_per_wall_absorption(self):
+        """Check that when the values of absorption and scattering are the same
+        across walls and frequency bands, the output histograms are:
+        - all equal across frequency bands
+        - equal to simply passing a float value instead of a
+        (num_bands, num_walls) or (num_walls,) tensor.
+        """
+
+        room_dim = torch.tensor([20, 25], dtype=self.dtype)
+        mic_array = torch.tensor([[2, 2], [8, 8]], dtype=self.dtype)
+        source = torch.tensor([7, 6], dtype=self.dtype)
+        num_rays = 1_000
+        ABS, SCAT = 0.1, 0.2
+
+        # Constant coefficients given per band and per wall: 6 bands x 4 walls.
+        absorption = torch.full(fill_value=ABS, size=(6, 4), dtype=self.dtype)
+        scattering = torch.full(fill_value=SCAT, size=(6, 4), dtype=self.dtype)
+        hist_per_band_per_wall = F.ray_tracing(
+            room=room_dim,
+            source=source,
+            mic_array=mic_array,
+            num_rays=num_rays,
+            absorption=absorption,
+            scattering=scattering,
+        )
+        # Same constants given per wall only.
+        absorption = torch.full(fill_value=ABS, size=(4,), dtype=self.dtype)
+        scattering = torch.full(fill_value=SCAT, size=(4,), dtype=self.dtype)
+        hist_per_wall = F.ray_tracing(
+            room=room_dim,
+            source=source,
+            mic_array=mic_array,
+            num_rays=num_rays,
+            absorption=absorption,
+            scattering=scattering,
+        )
+
+        # Same constants given as plain floats.
+        absorption = ABS
+        scattering = SCAT
+        hist_single = F.ray_tracing(
+            room=room_dim,
+            source=source,
+            mic_array=mic_array,
+            num_rays=num_rays,
+            absorption=absorption,
+            scattering=scattering,
+        )
+        assert hist_per_band_per_wall.shape == (2, 6, 2500)
+        assert hist_per_wall.shape == (2, 1, 2500)
+        assert hist_single.shape == (2, 1, 2500)
+        torch.testing.assert_close(hist_single, hist_per_wall)
+
+        # Repeat the single-band result over the 6 bands before comparing.
+        hist_single = hist_single.expand(2, 6, 2500)
+        torch.testing.assert_close(hist_single, hist_per_band_per_wall)
+
+    @parameterized.expand(
+        [
+            ([20, 25], [2, 2], [[8, 8], [7, 6]], 10_000),  # 2D with 2 mics
+            ([20, 25, 30], [1, 10, 5], [[8, 8, 22]], 1_000),  # 3D with 1 mic
+        ]
+    )
+    @skipIfNoModule("pyroomacoustics")  # keep below expand so each generated case is skipped
+    def test_ray_tracing_same_results_as_pyroomacoustics(self, room_dim, source, mic_array, num_rays):
+        """Compare ``F.ray_tracing`` against pyroomacoustics on the same scene.
+
+        Args:
+            room_dim: Room size as a list of 2 or 3 floats.
+            source: Source position, same length as ``room_dim``.
+            mic_array: List of microphone positions, one row per microphone.
+            num_rays: Number of rays passed to both implementations.
+
+        The skip decorator is placed under ``parameterized.expand`` because
+        ``expand`` generates the test cases from the function it receives; a
+        decorator stacked above it would not reach the generated cases, and
+        they would fail on the undefined ``pra`` when pyroomacoustics is absent.
+        """
+        walls = ["west", "east", "south", "north"]
+        if len(room_dim) == 3:
+            walls += ["floor", "ceiling"]
+        num_walls = len(walls)
+        num_bands = 6  # Note: in ray tracing, we don't need to restrict the number of bands to 7
+
+        absorption = torch.rand(num_bands, num_walls, dtype=self.dtype)
+        scattering = torch.rand(num_bands, num_walls, dtype=self.dtype)
+        energy_thres = 1e-7
+        time_thres = 10.0
+        hist_bin_size = 0.004
+        mic_radius = 0.5
+        sound_speed = 343.0
+
+        room_dim = torch.tensor(room_dim, dtype=self.dtype)
+        source = torch.tensor(source, dtype=self.dtype)
+        mic_array = torch.tensor(mic_array, dtype=self.dtype)
+
+        # Reference room: column i of absorption/scattering holds the per-band coefficients of walls[i].
+        room = pra.ShoeBox(
+            room_dim.tolist(),
+            ray_tracing=True,
+            materials={
+                walls[i]: pra.Material(
+                    energy_absorption={
+                        "coeffs": absorption[:, i].reshape(-1).detach().numpy(),
+                        "center_freqs": 125 * 2 ** np.arange(num_bands),
+                    },
+                    scattering={
+                        "coeffs": scattering[:, i].reshape(-1).detach().numpy(),
+                        "center_freqs": 125 * 2 ** np.arange(num_bands),
+                    },
+                )
+                for i in range(num_walls)
+            },
+            air_absorption=False,
+            max_order=0,  # Make sure PRA doesn't use the hybrid method (we just want ray tracing)
+        )
+        # NOTE(review): transposed, presumably because PRA expects (D, num_mics) -- confirm.
+        room.add_microphone_array(mic_array.T.tolist())
+        room.add_source(source.tolist())
+        room.set_ray_tracing(
+            n_rays=num_rays,
+            energy_thres=energy_thres,
+            time_thres=time_thres,
+            hist_bin_size=hist_bin_size,
+            receiver_radius=mic_radius,
+        )
+        room.set_sound_speed(sound_speed)
+
+        room.compute_rir()
+        # NOTE(review): [:, 0, 0] presumably keeps every mic and selects the single source -- confirm.
+        hist_pra = torch.tensor(np.array(room.rt_histograms))[:, 0, 0]
+
+        hist = F.ray_tracing(
+            room=room_dim,
+            source=source,
+            mic_array=mic_array,
+            num_rays=num_rays,
+            absorption=absorption,
+            scattering=scattering,
+            sound_speed=sound_speed,
+            mic_radius=mic_radius,
+            energy_thres=energy_thres,
+            time_thres=time_thres,
+            hist_bin_size=hist_bin_size,
+        )
+
+        assert hist.ndim == 3
+        assert hist.shape == hist_pra.shape
+        # Cast to float32 before comparing (presumably the dtype of the PRA histogram -- confirm).
+        self.assertEqual(hist.to(torch.float32), hist_pra)