Skip to content

Add simulate_rir_ism method for simulating RIR with Image Source Method #2644

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .circleci/unittest/linux/scripts/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ fi
(
set -x
conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} 'librosa>=0.8.0' parameterized 'requests>=2.20'
pip install kaldi-io SoundFile coverage pytest pytest-cov 'scipy==1.7.3' transformers expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag
pip install kaldi-io SoundFile coverage pytest pytest-cov 'scipy==1.7.3' transformers expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics
)
# Install fairseq
git clone https://github.com/pytorch/fairseq
Expand Down
3 changes: 2 additions & 1 deletion .circleci/unittest/windows/scripts/install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ esac
unidecode \
'protobuf<4.21.0' \
demucs \
tinytag
tinytag \
pyroomacoustics
)
# Install fairseq
git clone https://github.com/pytorch/fairseq
Expand Down
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ endif()
# Options
option(BUILD_SOX "Build libsox statically" ON)
option(BUILD_KALDI "Build kaldi statically" ON)
option(BUILD_RIR "Enable RIR simulation" ON)
option(BUILD_RNNT "Enable RNN transducer" ON)
option(BUILD_CTC_DECODER "Build Flashlight CTC decoder" ON)
option(BUILD_TORCHAUDIO_PYTHON_EXTENSION "Build Python extension" OFF)
Expand Down
5 changes: 5 additions & 0 deletions docs/source/prototype.functional.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,8 @@ fftconvolve
~~~~~~~~~~~

.. autofunction:: fftconvolve

simulate_rir_ism
~~~~~~~~~~~~~~~~

.. autofunction:: simulate_rir_ism
8 changes: 8 additions & 0 deletions docs/source/refs.bib
Original file line number Diff line number Diff line change
Expand Up @@ -423,3 +423,11 @@ @article{CATTONI2021101155
keywords = {Spoken language translation, Multilingual corpus},
abstract = {End-to-end spoken language translation (SLT) has recently gained popularity thanks to the advancement of sequence to sequence learning in its two parent tasks: automatic speech recognition (ASR) and machine translation (MT). However, research in the field has to confront with the scarcity of publicly available corpora to train data-hungry neural networks. Indeed, while traditional cascade solutions can build on sizable ASR and MT training data for a variety of languages, the available SLT corpora suitable for end-to-end training are few, typically small and of limited language coverage. We contribute to fill this gap by presenting MuST-C, a large and freely available Multilingual Speech Translation Corpus built from English TED Talks. Its unique features include: i) language coverage and diversity (from English into 14 languages from different families), ii) size (at least 237 hours of transcribed recordings per language, 430 on average), iii) variety of topics and speakers, and iv) data quality. Besides describing the corpus creation methodology and discussing the outcomes of empirical and manual quality evaluations, we present baseline results computed with strong systems on each language direction covered by MuST-C.}
}
@inproceedings{scheibler2018pyroomacoustics,
title={Pyroomacoustics: A Python package for audio room simulation and array processing algorithms},
author={Scheibler, Robin and Bezzam, Eric and Dokmani{\'c}, Ivan},
booktitle={2018 IEEE international conference on acoustics, speech and signal processing (ICASSP)},
pages={351--355},
year={2018},
organization={IEEE}
}
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import torch
from torchaudio_unittest.common_utils import PytorchTestCase

from .autograd_test_impl import AutogradTestImpl
from .autograd_test_impl import AutogradTestImpl, AutogradTestRIRImpl


class TestAutogradCPUFloat64(AutogradTestImpl, PytorchTestCase):
    """CPU / float64 instantiation of the shared autograd test suite."""

    # float64 gives gradcheck enough numerical precision on CPU.
    dtype = torch.float64
    device = torch.device("cpu")


class TestAutogradRIRCPUFloat64(AutogradTestRIRImpl, PytorchTestCase):
    """CPU / float64 instantiation of the RIR-simulation autograd tests."""

    # float64 gives gradcheck enough numerical precision on CPU.
    dtype = torch.float64
    device = torch.device("cpu")
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,24 @@ def test_add_noise(self):

self.assertTrue(gradcheck(F.add_noise, (waveform, noise, lengths, snr)))
self.assertTrue(gradgradcheck(F.add_noise, (waveform, noise, lengths, snr)))


class AutogradTestRIRImpl(TestBaseMixin):
    """Autograd smoke tests for ``F.simulate_rir_ism``.

    Verifies first- and second-order gradients via ``gradcheck`` /
    ``gradgradcheck`` for both 2-D and 3-D rooms.
    """

    @parameterized.expand([(2, 1), (3, 4)])
    def test_simulate_rir_ism(self, D, channel):
        """Gradients of simulate_rir_ism pass torch's numerical checks."""
        tensor_kwargs = {"dtype": self.dtype, "device": self.device, "requires_grad": True}
        # Draw tensors in this exact order so the RNG consumption matches
        # a straight-line version of this test.
        room = torch.rand(D, **tensor_kwargs)
        mic_array = torch.rand(channel, D, **tensor_kwargs)
        source = torch.rand(D, **tensor_kwargs)
        args = (room, source, mic_array, 2, 0.5, 1000)  # max_order=2, e_absorption=0.5, output_length=1000
        # Loose tolerances: the RIR is assembled from many delayed impulses,
        # which makes the numerical Jacobian noisy.
        for checker in (gradcheck, gradgradcheck):
            self.assertTrue(checker(F.simulate_rir_ism, args, atol=1e-3, rtol=1))
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import torch
from torchaudio_unittest.common_utils import PytorchTestCase

from .functional_test_impl import FunctionalTestImpl
from .functional_test_impl import FunctionalTestImpl, FunctionalTestRIRImpl


class FunctionalFloat32CPUTest(FunctionalTestImpl, PytorchTestCase):
Expand All @@ -12,3 +12,13 @@ class FunctionalFloat32CPUTest(FunctionalTestImpl, PytorchTestCase):
class FunctionalFloat64CPUTest(FunctionalTestImpl, PytorchTestCase):
    """CPU / float64 instantiation of the shared functional test suite."""

    dtype = torch.float64
    device = torch.device("cpu")


class FunctionalRIRFloat32Test(FunctionalTestRIRImpl, PytorchTestCase):
    """CPU / float32 instantiation of the RIR-simulation functional tests."""

    dtype = torch.float32
    device = torch.device("cpu")


class FunctionalRIRFloat64Test(FunctionalTestRIRImpl, PytorchTestCase):
    """CPU / float64 instantiation of the RIR-simulation functional tests."""

    dtype = torch.float64
    device = torch.device("cpu")
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import numpy as np

try:
import pyroomacoustics as pra
Copy link
Collaborator

@mthrok mthrok Oct 4, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please follow

if _mod_utils.is_module_available("soundfile"):
import soundfile

except Exception:
pass
import torch
import torchaudio.prototype.functional as F
from parameterized import parameterized
from scipy import signal
from torchaudio_unittest.common_utils import nested_params, TestBaseMixin
from torchaudio_unittest.common_utils import nested_params, skipIfNoModule, TestBaseMixin


class FunctionalTestImpl(TestBaseMixin):
Expand Down Expand Up @@ -107,3 +112,79 @@ def test_add_noise_length_check(self):

with self.assertRaisesRegex(ValueError, "Length dimensions"):
F.add_noise(waveform, noise, lengths, snr)


class FunctionalTestRIRImpl(TestBaseMixin):
    """Tests ``F.simulate_rir_ism`` against a pyroomacoustics reference implementation.

    Both tests build the same shoebox room in pyroomacoustics, compute the
    reference RIR, and compare it with torchaudio's output. The single-band
    variant uses one absorption coefficient for every wall; the multi-band
    variant uses a per-wall, 7-band coefficient table.
    """

    def _make_inputs(self, D, channel):
        """Draw random room dimensions, mic positions and source position.

        Offsets keep the source and microphones strictly inside the room
        (room sides are in [5, 6), mics in [1, 2), source in [4, 5)).
        Tensors are drawn in this order so RNG consumption is deterministic.
        """
        room_dim = torch.rand(D, dtype=self.dtype, device=self.device) + 5
        mic_array = torch.rand(channel, D, dtype=self.dtype, device=self.device) + 1
        source = torch.rand(D, dtype=self.dtype, device=self.device) + 4
        return room_dim, mic_array, source

    def _pra_expected_rir(self, room_dim, mic_array, source, max_order, materials):
        """Compute the reference RIR with pyroomacoustics.

        Returns a (channel, max_len) tensor; per-channel RIRs of different
        lengths are right-padded with zeros to the longest one.
        """
        room = pra.ShoeBox(
            # .cpu() so the test also works when self.device is not CPU.
            room_dim.detach().cpu().numpy(),
            fs=16000,
            materials=materials,
            max_order=max_order,
            ray_tracing=False,
            air_absorption=False,
        )
        # pyroomacoustics expects microphone positions with shape (D, channel).
        room.add_microphone_array(mic_array.detach().cpu().numpy().T)
        room.add_source(source.tolist())
        room.compute_rir()
        channel = mic_array.shape[0]
        max_len = max(room.rir[i][0].shape[0] for i in range(channel))
        expected = torch.zeros(channel, max_len, dtype=self.dtype, device=self.device)
        for i in range(channel):
            rir_i = torch.from_numpy(room.rir[i][0])
            expected[i, : rir_i.shape[0]] = rir_i
        return expected

    @skipIfNoModule("pyroomacoustics")
    @parameterized.expand([(2, 1), (3, 4)])
    def test_simulate_rir_ism_single_band(self, D, channel):
        """Test simulate_rir_ism in the case where absorption coefficients are identical for all walls."""
        room_dim, mic_array, source = self._make_inputs(D, channel)
        max_order = 3
        e_absorption = 0.5
        expected = self._pra_expected_rir(room_dim, mic_array, source, max_order, pra.Material(e_absorption))
        actual = F.simulate_rir_ism(room_dim, source, mic_array, max_order, e_absorption)
        # Loose tolerances: minor numerical differences are expected between
        # the two implementations.
        self.assertEqual(expected, actual, atol=1e-3, rtol=2)

    @skipIfNoModule("pyroomacoustics")
    @parameterized.expand([(2, 1), (3, 4)])
    def test_simulate_rir_ism_multi_band(self, D, channel):
        """Test simulate_rir_ism in the case where absorption coefficients are different for all walls."""
        room_dim, mic_array, source = self._make_inputs(D, channel)
        max_order = 3
        walls = ["west", "east", "south", "north"]
        if D == 3:
            walls += ["floor", "ceiling"]
        # One column of 7 band coefficients per wall (4 walls in 2-D, 6 in 3-D).
        e_absorption = torch.rand(7, len(walls), dtype=self.dtype, device=self.device)
        center_freqs = [125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0]
        materials = {
            wall: pra.Material(
                {
                    "coeffs": e_absorption[:, i].detach().cpu().numpy(),
                    "center_freqs": center_freqs,
                }
            )
            for i, wall in enumerate(walls)
        }
        expected = self._pra_expected_rir(room_dim, mic_array, source, max_order, materials)
        actual = F.simulate_rir_ism(room_dim, source, mic_array, max_order, e_absorption)
        self.assertEqual(expected, actual, atol=1e-3, rtol=2)
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import torch
from torchaudio_unittest.common_utils import PytorchTestCase

from .torchscript_consistency_test_impl import TorchScriptConsistencyTestImpl
from .torchscript_consistency_test_impl import TorchScriptConsistencyTestImpl, TorchScriptConsistencyTestRIRImpl


class TorchScriptConsistencyCPUFloat32Test(TorchScriptConsistencyTestImpl, PytorchTestCase):
Expand All @@ -12,3 +12,13 @@ class TorchScriptConsistencyCPUFloat32Test(TorchScriptConsistencyTestImpl, Pytor
class TorchScriptConsistencyCPUFloat64Test(TorchScriptConsistencyTestImpl, PytorchTestCase):
    """CPU / float64 instantiation of the TorchScript consistency tests."""

    dtype = torch.float64
    device = torch.device("cpu")


class TorchScriptConsistencyRIRCPUFloat32Test(TorchScriptConsistencyTestRIRImpl, PytorchTestCase):
    """CPU / float32 instantiation of the RIR TorchScript consistency tests."""

    dtype = torch.float32
    device = torch.device("cpu")


class TorchScriptConsistencyRIRCPUFloat64Test(TorchScriptConsistencyTestRIRImpl, PytorchTestCase):
    """CPU / float64 instantiation of the RIR TorchScript consistency tests."""

    dtype = torch.float64
    device = torch.device("cpu")
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,53 @@ def test_add_noise(self):
snr = torch.rand(*leading_dims, dtype=self.dtype, device=self.device, requires_grad=True) * 10

self._assert_consistency(F.add_noise, (waveform, noise, lengths, snr))


class TorchScriptConsistencyTestRIRImpl(TestBaseMixin):
    """Checks that ``F.simulate_rir_ism`` produces the same result when run
    eagerly and when compiled with TorchScript."""

    def _assert_consistency(self, func, inputs, shape_only=False):
        """Run ``func`` eagerly and as TorchScript on ``inputs`` and compare.

        Tensor inputs are moved to the test's device/dtype; non-tensor
        arguments are passed through unchanged. When ``shape_only`` is True,
        only output shapes are compared instead of values.
        """
        inputs_ = []
        for i in inputs:
            if torch.is_tensor(i):
                i = i.to(device=self.device, dtype=self.dtype)
            inputs_.append(i)
        ts_func = torch_script(func)

        # Re-seed before each call so any internal randomness is identical
        # across the eager and scripted runs.
        torch.random.manual_seed(40)
        output = func(*inputs_)

        torch.random.manual_seed(40)
        ts_output = ts_func(*inputs_)

        if shape_only:
            # NOTE(review): source indentation was lost in extraction; the
            # assertEqual below is reconstructed at method level so it runs
            # in both branches — confirm against the original file.
            ts_output = ts_output.shape
            output = output.shape
        self.assertEqual(ts_output, output)

    @parameterized.expand([(2, 1), (3, 4)])
    def test_simulate_rir_ism_single_band(self, D, channel):
        # Single absorption coefficient shared by all walls.
        room_dim = torch.rand(D, dtype=self.dtype, device=self.device) + 5
        mic_array = torch.rand(channel, D, dtype=self.dtype, device=self.device) + 1
        source = torch.rand(D, dtype=self.dtype, device=self.device) + 4
        max_order = 3
        e_absorption = 0.5
        center_frequency = torch.tensor([125, 250, 500, 1000, 2000, 4000, 8000], dtype=self.dtype, device=self.device)
        # Remaining positional args: output_length=None, delay_filter_length=81,
        # center_frequency, sound_speed=343.0, sample_rate=16000.0
        # — presumably matching simulate_rir_ism's signature; verify there.
        self._assert_consistency(
            F.simulate_rir_ism,
            (room_dim, source, mic_array, max_order, e_absorption, None, 81, center_frequency, 343.0, 16000.0),
        )

    @parameterized.expand([(2, 1), (3, 4)])
    def test_simulate_rir_ism_multi_band(self, D, channel):
        # Per-wall, 7-band absorption table: 4 walls in 2-D, 6 in 3-D.
        room_dim = torch.rand(D, dtype=self.dtype, device=self.device) + 5
        mic_array = torch.rand(channel, D, dtype=self.dtype, device=self.device) + 1
        source = torch.rand(D, dtype=self.dtype, device=self.device) + 4
        max_order = 3
        if D == 2:
            e_absorption = torch.rand(7, 4, dtype=self.dtype, device=self.device)
        else:
            e_absorption = torch.rand(7, 6, dtype=self.dtype, device=self.device)
        center_frequency = torch.tensor([125, 250, 500, 1000, 2000, 4000, 8000], dtype=self.dtype, device=self.device)
        self._assert_consistency(
            F.simulate_rir_ism,
            (room_dim, source, mic_array, max_order, e_absorption, None, 81, center_frequency, 343.0, 16000.0),
        )
8 changes: 8 additions & 0 deletions torchaudio/csrc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,14 @@ if(BUILD_RNNT)
endif()
endif()

if(BUILD_RIR)
list(
APPEND
LIBTORCHAUDIO_SOURCES
build_rir.cpp
)
endif()

if(USE_CUDA)
list(
APPEND
Expand Down
Loading