Skip to content

Commit 3933eee

Browse files
committed
implement standardization.
1 parent 02cf5f6 commit 3933eee

File tree

1 file changed

+30
-52
lines changed

1 file changed

+30
-52
lines changed

beginner_source/audio_preprocessing_tutorial.py

Lines changed: 30 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,15 @@
2929

3030

3131
######################################################################
32-
# Torchaudio supports loading sound files in the wav and mp3 format.
32+
# Torchaudio supports loading sound files in the wav and mp3 format. We
33+
# call waveform the resulting raw audio signal.
3334
#
3435

3536
filename = "_static/img/steam-train-whistle-daniel_simon-converted-from-mp3.wav"
36-
waveform, frequency = torchaudio.load(filename)
37+
waveform, sample_rate = torchaudio.load(filename)
3738

3839
print("Shape of waveform: {}".format(waveform.size()))
39-
print("Frequency of waveform: {}".format(frequency))
40+
print("Sample rate of waveform: {}".format(sample_rate))
4041

4142
plt.figure()
4243
plt.plot(waveform.transpose(0,1).numpy())
@@ -53,23 +54,18 @@
5354
# FloatTensor) to a floating point number between -1.0 and 1.0. Note
5455
# the 16-bit number is called the “bit depth” or “precision”, not to be
5556
# confused with “bit rate”.
56-
# - **PadTrim**: PadTrim a 2d-Tensor
57-
# - **Downmix**: Downmix any stereo signals to mono.
58-
# - **LC2CL**: Permute a 2d tensor from samples (n x c) to (c x n).
59-
# - **Resample**: Resample the signal to a different frequency.
60-
# - **Spectrogram**: Create a spectrogram from a raw audio signal
61-
# - **MelScale**: This turns a normal STFT into a mel frequency STFT,
62-
# using a conversion matrix. This uses triangular filter banks.
57+
# - **Resample**: Resample waveform to a different sample rate.
58+
# - **Spectrogram**: Create a spectrogram from a waveform.
59+
# - **MelScale**: This turns a normal STFT into a Mel-frequency STFT,
60+
# using a conversion matrix.
6361
# - **SpectrogramToDB**: This turns a spectrogram from the
6462
# power/amplitude scale to the decibel scale.
65-
# - **MFCC**: Create the Mel-frequency cepstrum coefficients from an
66-
# audio signal
67-
# - **MelSpectrogram**: Create MEL Spectrograms from a raw audio signal
68-
# using the STFT function in PyTorch.
69-
# - **BLC2CBL**: Permute a 3d tensor from Bands x Sample length x
70-
# Channels to Channels x Bands x Samples length.
71-
# - **MuLawEncoding**: Encode signal based on mu-law companding.
72-
# - **MuLawExpanding**: Decode mu-law encoded signal.
63+
# - **MFCC**: Create the Mel-frequency cepstrum coefficients from a
64+
# waveform.
65+
# - **MelSpectrogram**: Create MEL Spectrograms from a waveform using the
66+
# STFT function in PyTorch.
67+
# - **MuLawEncoding**: Encode waveform based on mu-law companding.
68+
# - **MuLawDecoding**: Decode mu-law encoded waveform.
7369
#
7470
# Since all transforms are nn.Modules or jit.ScriptModules, they can be
7571
# used as part of a neural network at any point.
@@ -85,7 +81,7 @@
8581
print("Shape of spectrogram: {}".format(specgram.size()))
8682

8783
plt.figure()
88-
plt.imshow(specgram.log2().transpose(1,2)[0,:,:].numpy(), cmap='gray')
84+
plt.imshow(specgram.log2()[0,:,:].numpy(), cmap='gray')
8985

9086

9187
######################################################################
@@ -97,36 +93,18 @@
9793
print("Shape of spectrogram: {}".format(specgram.size()))
9894

9995
plt.figure()
100-
p = plt.imshow(specgram.log2().transpose(1,2)[0,:,:].detach().numpy(), cmap='gray')
96+
p = plt.imshow(specgram.log2()[0,:,:].detach().numpy(), cmap='gray')
10197

10298

10399
######################################################################
104-
# We can resample the signal, one channel at a time.
100+
# We can resample the waveform, one channel at a time.
105101
#
106102

107-
new_frequency = frequency/10
103+
new_sample_rate = sample_rate/10
108104

109105
# Since Resample applies to a single channel, we resample first channel here
110106
channel = 0
111-
transformed = torchaudio.transforms.Resample(frequency, new_frequency)(waveform[channel,:].view(1,-1))
112-
113-
print("Shape of transformed waveform: {}".format(transformed.size()))
114-
115-
plt.figure()
116-
plt.plot(transformed[0,:].numpy())
117-
118-
119-
######################################################################
120-
# Or we can first convert the stereo to mono, and resample, using
121-
# composition.
122-
#
123-
124-
transformed = torchaudio.transforms.Compose([
125-
torchaudio.transforms.LC2CL(),
126-
torchaudio.transforms.DownmixMono(),
127-
torchaudio.transforms.LC2CL(),
128-
torchaudio.transforms.Resample(frequency, new_frequency)
129-
])(waveform)
107+
transformed = torchaudio.transforms.Resample(sample_rate, new_sample_rate)(waveform[channel,:].view(1,-1))
130108

131109
print("Shape of transformed waveform: {}".format(transformed.size()))
132110

@@ -136,8 +114,8 @@
136114

137115
######################################################################
138116
# As another example of transformations, we can encode the signal based on
139-
# the Mu-Law companding. But to do so, we need the signal to be between -1
140-
# and 1. Since the tensor is just a regular PyTorch tensor, we can apply
117+
# Mu-Law encoding. But to do so, we need the signal to be between -1 and
118+
# 1. Since the tensor is just a regular PyTorch tensor, we can apply
141119
# standard operators on it.
142120
#
143121

@@ -175,7 +153,7 @@ def normalize(tensor):
175153
# And now decode.
176154
#
177155

178-
reconstructed = torchaudio.transforms.MuLawExpanding()(transformed)
156+
reconstructed = torchaudio.transforms.MuLawDecoding()(transformed)
179157

180158
print("Shape of recovered waveform: {}".format(reconstructed.size()))
181159

@@ -216,7 +194,7 @@ def normalize(tensor):
216194
#
217195

218196
n_fft = 400.0
219-
frame_length = n_fft / frequency * 1000.0
197+
frame_length = n_fft / sample_rate * 1000.0
220198
frame_shift = frame_length / 2.0
221199

222200
params = {
@@ -227,7 +205,7 @@ def normalize(tensor):
227205
"frame_shift": frame_shift,
228206
"remove_dc_offset": False,
229207
"round_to_power_of_two": False,
230-
"sample_frequency": frequency,
208+
"sample_frequency": sample_rate,
231209
}
232210

233211
specgram = torchaudio.compliance.kaldi.spectrogram(waveform, **params)
@@ -239,7 +217,7 @@ def normalize(tensor):
239217

240218

241219
######################################################################
242-
# We also support computing the filterbank features from raw audio signal,
220+
# We also support computing the filterbank features from waveforms,
243221
# matching Kaldi’s implementation.
244222
#
245223

@@ -255,9 +233,9 @@ def normalize(tensor):
255233
# Conclusion
256234
# ----------
257235
#
258-
# We used an example sound signal to illustrate how to open an audio file
259-
# or using Torchaudio, and how to pre-process and transform an audio
260-
# signal. Given that Torchaudio is built on PyTorch, these techniques can
261-
# be used as building blocks for more advanced audio applications, such as
262-
# speech recognition, while leveraging GPUs.
236+
# We used an example raw audio signal, or waveform, to illustrate how to
237+
# open an audio file using Torchaudio, and how to pre-process and
238+
# transform such waveform. Given that Torchaudio is built on PyTorch,
239+
# these techniques can be used as building blocks for more advanced audio
240+
# applications, such as speech recognition, while leveraging GPUs.
263241
#

0 commit comments

Comments
 (0)