Skip to content

Add some transforms #122

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions test/test_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,37 @@ def _test_librosa_consistency_helper(n_fft, hop_length, power, n_mels, n_mfcc, s
_test_librosa_consistency_helper(**kwargs2)
_test_librosa_consistency_helper(**kwargs3)

def test_random_opposite(self):
    waveform = self.sig.clone()

    # NOTE(review): RandomOpposite(probability=0) always negates and
    # probability=1 never does -- the flip fires when rand >= probability,
    # which looks inverted relative to the parameter's name; confirm intent.
    flipped = transforms.RandomOpposite(probability=0)(waveform)
    self.assertTrue(torch.allclose(flipped, -waveform, atol=5e-3))

    unchanged = transforms.RandomOpposite(probability=1)(waveform)
    self.assertTrue(torch.allclose(unchanged, waveform, atol=5e-3))

def test_random_stretch(self):
    audio_orig = self.sig.clone().transpose(0, 1)

    # max_factor=1 forces the drawn factor to exactly 1 -> identity.
    # (Fixed: the class is spelled RandomStretch, not RandomStrech.)
    audio_stretched = transforms.RandomStretch(max_factor=1)(audio_orig)
    self.assertTrue(torch.allclose(audio_stretched, audio_orig, atol=5e-3))

    # With max_factor=2 the drawn factor is almost surely != 1, so the
    # time length should change. Fails only if rand() yields a factor of
    # exactly one -- vanishingly unlikely.
    audio_stretched = transforms.RandomStretch(max_factor=2)(audio_orig)
    self.assertNotEqual(audio_stretched.size(1), audio_orig.size(1))

def test_random_crop(self):
    waveform = self.sig.clone().transpose(0, 1)

    # The crop must keep exactly the requested number of time samples.
    cropped = transforms.RandomCrop(200)(waveform)
    self.assertEqual(cropped.size(1), 200)

def test_pad(self):
    waveform = self.sig.clone().transpose(0, 1)

    # A padding of 200 is applied on both ends of the time axis.
    padded = transforms.Pad(200, 0)(waveform)
    self.assertEqual(padded.size(1), waveform.size(1) + 200 * 2)


# Allow running this test file directly: `python test/test_transforms.py`.
if __name__ == '__main__':
    unittest.main()
125 changes: 124 additions & 1 deletion torchaudio/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,13 @@
'MFCC',
'BLC2CBL',
'mu_law_encoding',
'mu_law_expanding'
'mu_law_expanding',
'crop_in_between',
'random_crop',
'stretch',
'random_stretch',
'opposite',
'random_opposite'
]


Expand Down Expand Up @@ -350,3 +356,120 @@ def mu_law_expanding(x_mu, qc):
x = ((x_mu) / mu) * 2 - 1.
x = torch.sign(x) * (torch.exp(torch.abs(x) * torch.log1p(mu)) - 1.) / mu
return x


def crop_in_between(tensor, start, end, ch_dim):
    """Return the [start, end) span of the tensor's time dimension.

    Args:
        tensor (Tensor): Tensor of audio of size (NxC) or (CxN)
        start (int): Starting point of crop (inclusive)
        end (int): Ending point of crop (exclusive)
        ch_dim (int): Dimension of channel (not size)

    Returns:
        Tensor: a piece of the tensor (a view, no copy)
    """
    # The time axis is whichever dimension is not the channel one, so we
    # slice it directly instead of transposing back and forth.
    if ch_dim == 1:
        return tensor[start:end]
    return tensor[:, start:end]


def random_crop(tensor, size, ch_dim):
    """Randomly crops a piece of the tensor along the time dimension.

    Args:
        tensor (Tensor): Tensor of audio of size (NxC) or (CxN)
        size (int): Number of time samples to keep
        ch_dim (int): Dimension of channel (not size)

    Returns:
        Tensor: a piece of the tensor of `size` samples on the time axis
    """
    time_dim = 1 - ch_dim
    orig_size = tensor.size(time_dim)
    # The last valid start index is orig_size - size, so the (exclusive)
    # upper bound is orig_size - size + 1. The previous bound both
    # excluded the final window and crashed when size == orig_size,
    # since torch.randint requires high > low.
    start = torch.randint(0, orig_size - size + 1, (1,)).item()
    return tensor.narrow(time_dim, start, size)


def stretch(tensor, factor, interpolate, ch_dim):
    """Stretch a tensor along the time dimension (not the channel one)
    by the given factor.

    Args:
        tensor (Tensor): Tensor of audio of size (n x c) or (c x n)
        factor (Tensor or float): Stretching factor of the tensor
        interpolate (str): mode of interpolation for the generated audio
            points ('linear' or 'nearest'), matched case-insensitively
        ch_dim (int): Dimension of channel (not size)

    Returns:
        Tensor: the stretched tensor

    Raises:
        ValueError: if `interpolate` is neither 'linear' nor 'nearest'
    """
    type_orig = tensor.type()
    if ch_dim == 1:
        tensor = tensor.transpose(0, 1)

    # Accept python floats as well as tensors; the docstring always
    # promised both, but `.float()` used to fail on a plain float.
    factor = torch.as_tensor(factor, dtype=torch.float32)

    # Fractional source index for every output sample.
    output_size = (tensor.size(1) * factor).float()
    ref = torch.arange(output_size.item()) / factor

    # Select interpolation type
    mode = interpolate.lower()
    if mode == 'linear':
        ref1 = ref.floor().float()
        ref2 = torch.clamp_max(ref1 + 1, tensor.size(1) - 1)
        r = (ref - ref1).type(type_orig)  # Weight given to sound[ref2]
        stretched_sound = (tensor[:, ref1.long()] * (1 - r) +
                           tensor[:, ref2.long()] * r)
    elif mode == 'nearest':
        # Round to the closest source index (truncation was floor, not
        # nearest) and index the time axis -- indexing the first axis
        # selected channels instead of samples.
        idx = torch.clamp_max(ref.round().long(), tensor.size(1) - 1)
        stretched_sound = tensor[:, idx]
    else:
        raise ValueError('Invalid interpolation mode {}'.format(
            interpolate))

    if ch_dim == 1:
        stretched_sound = stretched_sound.transpose(0, 1)

    return stretched_sound


def random_stretch(tensor, max_factor, interpolate, ch_dim):
    """Stretch a tensor along the time dimension by a random factor.

    The factor is drawn as ``max_factor ** u`` with u uniform in [-1, 1),
    so the audio is stretched or shrunk by up to `max_factor` in either
    direction.

    Args:
        tensor (Tensor): Tensor of audio of size (n x c) or (c x n)
        max_factor (float): Max stretching factor of the tensor
        interpolate (str): Mode of interpolation for the generated audio
            points (linear or nearest)
        ch_dim (int): Dimension of channel (not size)

    Returns:
        Tensor: the stretched tensor
    """
    exponent = torch.rand(1) * 2 - 1  # uniform in [-1, 1)
    return stretch(tensor, max_factor ** exponent, interpolate, ch_dim)


def opposite(tensor):
    """Return the element-wise negation of the tensor."""
    return tensor.neg()


def random_opposite(tensor, probability):
    """Randomly return the opposite values of the tensor.

    NOTE(review): the sign flip happens when ``torch.rand(1) >= probability``,
    i.e. with probability ``1 - probability`` -- the parameter's semantics
    look inverted relative to its name; confirm intent (the current tests
    assert this behavior).
    """
    flip = torch.rand(1) >= probability
    if flip:
        return opposite(tensor)
    return tensor
159 changes: 159 additions & 0 deletions torchaudio/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@
from . import functional as F


def _check_audio(tensor):
if not isinstance(tensor, nn.Tensor):
raise TypeError('tensor should be a torch tensor')
if len(tensor.size()) > 2:
raise TypeError(('tensor representing audio should be at most ',
'2Dimentional'))


class Compose(object):
"""Composes several transforms together.

Expand Down Expand Up @@ -444,3 +452,154 @@ def __call__(self, x_mu):

def __repr__(self):
return self.__class__.__name__ + '()'


class Pad(object):
    """Pad the given tensor on both ends of the time axis with a constant
    fill value.

    Args:
        padding (int or tuple): Padding on each border. If a single int is
            provided this is used to pad both borders. If a tuple of length 2
            is provided this is the padding on left/right.
        fill: fill value. Default: 0
        channel_first (bool): Channel is first and time second. Default: `True`
    """
    def __init__(self, padding, fill=0, channel_first=True):
        self.padding = padding
        self.fill = fill
        # 0 when channels lead (C x S), 1 when samples lead (S x C).
        self.ch_dim = int(not channel_first)

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Audio of size (Samples x Channels) or (C x S)

        Returns:
            Tensor: A tensor padded right and/or left with the fill value
        """
        # ConstantPad1d pads the last dimension, so move time last first.
        if self.ch_dim == 1:
            tensor = tensor.transpose(0, 1)

        tensor = torch.nn.ConstantPad1d(self.padding, self.fill)(tensor)

        if self.ch_dim == 1:
            tensor = tensor.transpose(0, 1)

        return tensor


class RandomCrop(object):
    """Randomly crops a piece of the tensor along the time axis.

    Wraps ``functional.random_crop``.

    Args:
        size (int): size of the crop to retrieve
        channel_first (bool): Channel is first and time second. Default: `True`
    """
    def __init__(self, size, channel_first=True):
        self.size = size
        # 0 when channels lead (C x S), 1 when samples lead (S x C).
        self.ch_dim = int(not channel_first)

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Audio of size (SxC) or (CxS)

        Returns:
            Tensor: A random crop of `size` samples along the time axis.
        """
        return F.random_crop(tensor, self.size, self.ch_dim)


class RandomStretch(object):
    """Randomly stretch or shrink audio along the time axis.

    Wraps ``functional.random_stretch``: the stretch factor is drawn as
    ``max_factor ** u`` with u uniform in [-1, 1).

    Args:
        max_factor (float): Max stretching factor of the audio
        interpolate (str): mode of interpolation for the generated audio
            points ('linear' or 'nearest'); matched case-insensitively
        channel_first (bool): Channel is first and time second. Default: `True`
    """
    def __init__(self, max_factor=1.3, interpolate='Linear', channel_first=True):
        self.max_factor = max_factor
        self.interpolate = interpolate
        # 0 when channels lead (C x S), 1 when samples lead (S x C).
        self.ch_dim = int(not channel_first)

    def __call__(self, tensor):
        """
        Args:
            tensor (Tensor): Audio of size (Samples x Channels) or (C x S)

        Returns:
            Tensor: A tensor randomly stretched by a factor on the sample axis.
        """
        return F.random_stretch(tensor,
                                self.max_factor,
                                self.interpolate,
                                self.ch_dim)


class RandomOpposite(object):
    """Randomly negate (flip the sign of) the whole tensor.

    Wraps ``functional.random_opposite``.

    NOTE(review): the underlying function negates when
    ``torch.rand(1) >= probability``, i.e. with probability
    ``1 - probability`` -- confirm the intended semantics.

    Args:
        probability (float): Probability threshold for a flip to happen.
            Default: 0.5
    """
    def __init__(self, probability=0.5):
        self.probability = probability

    def __call__(self, tensor):
        # tensor (Tensor): signal tensor of shape (size, channels)
        return F.random_opposite(tensor, self.probability)


class AddChannelDimension(object):
    """Insert a channel dimension on 1D tensors so the result is 2D.

    Tensors that already have two (or more) dimensions are returned
    unchanged.

    Args:
        channel_first (bool): Channel is first and time second.
    """
    def __init__(self, channel_first):
        # 0 when channels lead (C x S), 1 when samples lead (S x C).
        self.ch_dim = int(not channel_first)

    def __call__(self, tensor):
        if tensor.dim() != 1:
            return tensor
        return tensor.unsqueeze(self.ch_dim)


class AddDimension(object):
    """Insert a size-one dimension at a fixed position.

    Useful to shape audio for a model, e.g. adding the extra dimension
    expected by 2D and 3D convolutions.

    Args:
        dimension (int): The position at which to insert the dimension.
    """
    def __init__(self, dimension):
        self.dim = int(dimension)

    def __call__(self, tensor):
        return tensor.unsqueeze(self.dim)


class ToTensor(object):
    """Convert a ``numpy.ndarray`` (or any array-like) to a tensor."""

    def __call__(self, array):
        """
        Args:
            array: a numpy array or array-like to be converted

        Returns:
            Tensor: Converted sound.
        """
        converted = torch.tensor(array)
        return converted

    def __repr__(self):
        return '{}()'.format(self.__class__.__name__)