29
29
30
30
31
31
######################################################################
32
- # Torchaudio supports loading sound files in the wav and mp3 format.
32
+ # Torchaudio supports loading sound files in the wav and mp3 format. We
33
+ # call the resulting raw audio signal a waveform.
33
34
#
34
35
35
36
filename = "_static/img/steam-train-whistle-daniel_simon-converted-from-mp3.wav"
36
- waveform , frequency = torchaudio .load (filename )
37
+ waveform , sample_rate = torchaudio .load (filename )
37
38
38
39
print ("Shape of waveform: {}" .format (waveform .size ()))
39
- print ("Frequency of waveform: {}" .format (frequency ))
40
+ print ("Sample rate of waveform: {}" .format (sample_rate ))
40
41
41
42
plt .figure ()
42
43
plt .plot (waveform .transpose (0 ,1 ).numpy ())
53
54
# FloatTensor) to a floating point number between -1.0 and 1.0. Note
54
55
# the 16-bit number is called the “bit depth” or “precision”, not to be
55
56
# confused with “bit rate”.
56
- # - **PadTrim**: PadTrim a 2d-Tensor
57
- # - **Downmix**: Downmix any stereo signals to mono.
58
- # - **LC2CL**: Permute a 2d tensor from samples (n x c) to (c x n).
59
- # - **Resample**: Resample the signal to a different frequency.
60
- # - **Spectrogram**: Create a spectrogram from a raw audio signal
61
- # - **MelScale**: This turns a normal STFT into a mel frequency STFT,
62
- # using a conversion matrix. This uses triangular filter banks.
57
+ # - **Resample**: Resample waveform to a different sample rate.
58
+ # - **Spectrogram**: Create a spectrogram from a waveform.
59
+ # - **MelScale**: This turns a normal STFT into a Mel-frequency STFT,
60
+ # using a conversion matrix.
63
61
# - **SpectrogramToDB**: This turns a spectrogram from the
64
62
# power/amplitude scale to the decibel scale.
65
- # - **MFCC**: Create the Mel-frequency cepstrum coefficients from an
66
- # audio signal
67
- # - **MelSpectrogram**: Create MEL Spectrograms from a raw audio signal
68
- # using the STFT function in PyTorch.
69
- # - **BLC2CBL**: Permute a 3d tensor from Bands x Sample length x
70
- # Channels to Channels x Bands x Samples length.
71
- # - **MuLawEncoding**: Encode signal based on mu-law companding.
72
- # - **MuLawExpanding**: Decode mu-law encoded signal.
63
+ # - **MFCC**: Create the Mel-frequency cepstrum coefficients from a
64
+ # waveform.
65
+ # - **MelSpectrogram**: Create Mel spectrograms from a waveform using the
66
+ # STFT function in PyTorch.
67
+ # - **MuLawEncoding**: Encode waveform based on mu-law companding.
68
+ # - **MuLawDecoding**: Decode mu-law encoded waveform.
73
69
#
74
70
# Since all transforms are nn.Modules or jit.ScriptModules, they can be
75
71
# used as part of a neural network at any point.
85
81
print ("Shape of spectrogram: {}" .format (specgram .size ()))
86
82
87
83
plt .figure ()
88
- plt .imshow (specgram .log2 (). transpose ( 1 , 2 ) [0 ,:,:].numpy (), cmap = 'gray' )
84
+ plt .imshow (specgram .log2 ()[0 ,:,:].numpy (), cmap = 'gray' )
89
85
90
86
91
87
######################################################################
97
93
print ("Shape of spectrogram: {}" .format (specgram .size ()))
98
94
99
95
plt .figure ()
100
- p = plt .imshow (specgram .log2 (). transpose ( 1 , 2 ) [0 ,:,:].detach ().numpy (), cmap = 'gray' )
96
+ p = plt .imshow (specgram .log2 ()[0 ,:,:].detach ().numpy (), cmap = 'gray' )
101
97
102
98
103
99
######################################################################
104
- # We can resample the signal , one channel at a time.
100
+ # We can resample the waveform , one channel at a time.
105
101
#
106
102
107
- new_frequency = frequency / 10
103
+ new_sample_rate = sample_rate / 10
108
104
109
105
# Since Resample applies to a single channel, we resample the first channel here
110
106
channel = 0
111
- transformed = torchaudio .transforms .Resample (frequency , new_frequency )(waveform [channel ,:].view (1 ,- 1 ))
112
-
113
- print ("Shape of transformed waveform: {}" .format (transformed .size ()))
114
-
115
- plt .figure ()
116
- plt .plot (transformed [0 ,:].numpy ())
117
-
118
-
119
- ######################################################################
120
- # Or we can first convert the stereo to mono, and resample, using
121
- # composition.
122
- #
123
-
124
- transformed = torchaudio .transforms .Compose ([
125
- torchaudio .transforms .LC2CL (),
126
- torchaudio .transforms .DownmixMono (),
127
- torchaudio .transforms .LC2CL (),
128
- torchaudio .transforms .Resample (frequency , new_frequency )
129
- ])(waveform )
107
+ transformed = torchaudio .transforms .Resample (sample_rate , new_sample_rate )(waveform [channel ,:].view (1 ,- 1 ))
130
108
131
109
print ("Shape of transformed waveform: {}" .format (transformed .size ()))
132
110
136
114
137
115
######################################################################
138
116
# As another example of transformations, we can encode the signal based on
139
- # the Mu-Law companding . But to do so, we need the signal to be between -1
140
- # and 1. Since the tensor is just a regular PyTorch tensor, we can apply
117
+ # Mu-Law encoding . But to do so, we need the signal to be between -1 and
118
+ # 1. Since the tensor is just a regular PyTorch tensor, we can apply
141
119
# standard operators on it.
142
120
#
143
121
@@ -175,7 +153,7 @@ def normalize(tensor):
175
153
# And now decode.
176
154
#
177
155
178
- reconstructed = torchaudio .transforms .MuLawExpanding ()(transformed )
156
+ reconstructed = torchaudio .transforms .MuLawDecoding ()(transformed )
179
157
180
158
print ("Shape of recovered waveform: {}" .format (reconstructed .size ()))
181
159
@@ -216,7 +194,7 @@ def normalize(tensor):
216
194
#
217
195
218
196
n_fft = 400.0
219
- frame_length = n_fft / frequency * 1000.0
197
+ frame_length = n_fft / sample_rate * 1000.0
220
198
frame_shift = frame_length / 2.0
221
199
222
200
params = {
@@ -227,7 +205,7 @@ def normalize(tensor):
227
205
"frame_shift" : frame_shift ,
228
206
"remove_dc_offset" : False ,
229
207
"round_to_power_of_two" : False ,
230
- "sample_frequency" : frequency ,
208
+ "sample_frequency" : sample_rate ,
231
209
}
232
210
233
211
specgram = torchaudio .compliance .kaldi .spectrogram (waveform , ** params )
@@ -239,7 +217,7 @@ def normalize(tensor):
239
217
240
218
241
219
######################################################################
242
- # We also support computing the filterbank features from raw audio signal ,
220
+ # We also support computing the filterbank features from waveforms ,
243
221
# matching Kaldi’s implementation.
244
222
#
245
223
@@ -255,9 +233,9 @@ def normalize(tensor):
255
233
# Conclusion
256
234
# ----------
257
235
#
258
- # We used an example sound signal to illustrate how to open an audio file
259
- # or using Torchaudio, and how to pre-process and transform an audio
260
- # signal . Given that Torchaudio is built on PyTorch, these techniques can
261
- # be used as building blocks for more advanced audio applications, such as
262
- # speech recognition, while leveraging GPUs.
236
+ # We used an example raw audio signal, or waveform, to illustrate how to
237
+ # open an audio file using Torchaudio, and how to pre-process and
238
+ # transform such a waveform . Given that Torchaudio is built on PyTorch,
239
+ # these techniques can be used as building blocks for more advanced audio
240
+ # applications, such as speech recognition, while leveraging GPUs.
263
241
#
0 commit comments