######################################################################
# Let’s check if a CUDA GPU is available and select our device. Running
# the network on a GPU will greatly decrease the training/testing runtime.
#

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
######################################################################
# Importing the Dataset
# ---------------------
#
# We will use the UrbanSound8K dataset to train our network. It is
# available for free `here <https://urbansounddataset.weebly.com/>`_ and contains
# 10 audio classes with over 8000 audio samples! Once you have downloaded
# the compressed dataset, extract it to your current working directory.
# First, we will look at the csv file that provides information about the
# individual sound files. ``pandas`` allows us to open the csv file and
# use ``.iloc`` to access the data within it.
#

csvData = pd.read_csv('./data/UrbanSound8K/metadata/UrbanSound8K.csv')
print(csvData.iloc[0, :])
# The 10 audio classes are air_conditioner, car_horn, children_playing,
# dog_bark, drilling, engine_idling, gun_shot, jackhammer, siren, and
# street_music. Let’s play a couple files
# and see what they sound like. The first file is street music and the
# second is an air conditioner.
#

import IPython.display as ipd
ipd.Audio('./data/UrbanSound8K/audio/fold1/108041-9-0-5.wav')
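
#The second clip is an air conditioner (class 0). The path below is
#illustrative; substitute any air_conditioner file from your extracted copy:
ipd.Audio('./data/UrbanSound8K/audio/fold5/100852-0-0-0.wav')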
######################################################################
# Formatting the Data
# -------------------
#
# Now that we know the format of the csv file entries, we can construct
# our dataset. We will create a wrapper class for our dataset using
# ``torch.utils.data.Dataset`` that will handle loading the files and
# performing some formatting steps. The UrbanSound8K dataset is separated
# into 10 folders. We will use the data from 9 of these folders to train
# our network, and then use the 10th folder to test it. The wrapper
# class will store the file names, labels, and folder numbers of the audio
# files in the provided folder list when initialized. The actual loading
# and formatting steps will happen in the access function ``__getitem__``.
#
# In ``__getitem__``, we use ``torchaudio.load()`` to convert the wav
# files to tensors. ``torchaudio.load()`` returns a tuple containing the
# newly created tensor along with the sampling frequency of the audio
# file (44.1kHz for UrbanSound8K).
# Not every audio tensor is long enough to handle the downsampling, so
# these tensors will need to be
# padded with zeros. The minimum length that won’t require padding is
# 160,000 samples.
#
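
#A quick sanity check of the pad-and-decimate arithmetic described above,
#on a dummy tensor (the variable names here are illustrative):
dummy = torch.rand([100000, 1])   #a clip shorter than 160,000 samples
padded = torch.zeros([160000, 1])
padded[:dummy.numel()] = dummy[:] #zero-pad up to the minimum length
print(padded[::5].shape)          #every fifth sample: torch.Size([32000, 1])
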
class UrbanSoundDataset(Dataset):
#wrapper for the UrbanSound8K dataset
# Argument List
# path to the UrbanSound8K csv file
# path to the UrbanSound8K audio files
# list of folders to use in the dataset

    def __init__(self, csv_path, file_path, folderList):
        csvData = pd.read_csv(csv_path)
        #initialize lists to hold file names, labels, and folder numbers
        self.file_names = []
        self.labels = []
        self.folders = []
        #loop through the csv entries and only add entries from folders in the folder list
        for i in range(0, len(csvData)):
            if csvData.iloc[i, 5] in folderList:
                self.file_names.append(csvData.iloc[i, 0])
                self.labels.append(csvData.iloc[i, 6])
                self.folders.append(csvData.iloc[i, 5])

        self.file_path = file_path
        self.mixer = torchaudio.transforms.DownmixMono() #UrbanSound8K uses two channels, this will convert them to one
        self.folderList = folderList

    def __getitem__(self, index):
        #format the file path and load the file
        path = self.file_path + "fold" + str(self.folders[index]) + "/" + self.file_names[index]
        sound = torchaudio.load(path)
        soundData = self.mixer(sound[0]) #load returns the tensor and its sample rate; downmix to one channel
        #zero-pad the audio if it is shorter than 160,000 samples
        tempData = torch.zeros([160000, 1])
        if soundData.numel() < 160000:
            tempData[:soundData.numel()] = soundData[:]
        else:
            tempData[:] = soundData[:160000]

        soundData = tempData
        soundFormatted = torch.zeros([32000, 1])
        soundFormatted[:32000] = soundData[::5] #take every fifth sample of soundData
        soundFormatted = soundFormatted.permute(1, 0)
        return soundFormatted, self.labels[index]

    def __len__(self):
        return len(self.file_names)

csv_path = './data/UrbanSound8K/metadata/UrbanSound8K.csv'
file_path = './data/UrbanSound8K/audio/'
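
#As a quick usage sketch (assuming the dataset has been extracted as
#described above), we can wrap a single fold and inspect one formatted item;
#``demo_set`` is illustrative and not part of the tutorial's pipeline:
demo_set = UrbanSoundDataset(csv_path, file_path, [1]) #fold 1 only
sound, label = demo_set[0]
print(sound.shape) #torch.Size([1, 32000])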
train_set = UrbanSoundDataset(csv_path, file_path, range(1, 10))
test_set = UrbanSoundDataset(csv_path, file_path, [10])
print("Train set size: " + str(len(train_set)))
print("Test set size: " + str(len(test_set)))

kwargs = {'num_workers': 1, 'pin_memory': True} if device == 'cuda' else {} #needed for running on gpu
train_loader = torch.utils.data.DataLoader(train_set, batch_size = 128, shuffle = True, **kwargs)
test_loader = torch.utils.data.DataLoader(test_set, batch_size = 128, shuffle = False, **kwargs)
######################################################################
# Define the Network
# ------------------
#
# For this tutorial we will use a convolutional neural network to process
# the raw audio data. Usually more advanced transforms are applied to the
# audio data; however, CNNs can be used to accurately process the raw data.
# An important factor in processing raw audio data is the receptive field
# of the model’s first layer. Our model’s first filter is length 80, so when
# processing audio sampled at 8kHz the receptive field is around 10ms.
# This size is similar to speech processing applications that often use
# receptive fields ranging from 20ms to 40ms.
#
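
#A quick check of the receptive-field arithmetic quoted above (the variable
#names here are illustrative):
first_filter_length = 80 #samples covered by the first convolution
sample_rate = 8000       #Hz, after the 5x downsampling
print(1000.0 * first_filter_length / sample_rate, "ms") #10.0 ms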

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv1d(1, 128, 80, 4)
        self.bn1 = nn.BatchNorm1d(128)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(128, 128, 3)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(128, 256, 3)
        self.bn3 = nn.BatchNorm1d(256)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(256, 512, 3)
        self.bn4 = nn.BatchNorm1d(512)
        self.pool4 = nn.MaxPool1d(4)
        self.avgPool = nn.AvgPool1d(30) #input should be 512x30 so this outputs a 512x1
        self.fc1 = nn.Linear(512, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = self.avgPool(x)
        x = x.permute(0, 2, 1) #change the 512x1 to 1x512
        x = self.fc1(x)
        return F.log_softmax(x, dim = 2)

model = Net()
model.to(device)
print(model)


######################################################################
# We will use the same optimization technique used in the PyTorch MNIST
# example: the Adam
# optimizer with weight decay set to 0.0001. At first, we will train with
# a learning rate of 0.01, but we will use a ``scheduler`` to decrease it
# to 0.001 during training.
#

optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay = 0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 20, gamma = 0.1)
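
#A minimal sketch of the schedule, run on a throwaway optimizer so the real
#one above is untouched: with step_size = 20 and gamma = 0.1, the learning
#rate is 0.01 for the first 20 steps and 0.001 afterwards.
demo_opt = optim.Adam([torch.zeros(1, requires_grad = True)], lr = 0.01)
demo_sched = optim.lr_scheduler.StepLR(demo_opt, step_size = 20, gamma = 0.1)
for i in range(21):
    demo_sched.step()
print(demo_opt.param_groups[0]['lr']) #0.001
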
######################################################################
# Training and Testing the Network
# --------------------------------
#
# Now let’s define a training function that will feed our training data
# into the model and perform the backward pass and optimization steps.
#

def train(model, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        data = data.to(device)
        target = target.to(device)
        data = data.requires_grad_() #set requires_grad to True for training
        output = model(data)
        output = output.permute(1, 0, 2) #original output dimensions are batchSize x 1 x 10
        loss = F.nll_loss(output[0], target) #the loss function expects a batchSize x 10 input
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0: #print training stats
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss))


######################################################################
# Now that we have a training function, we need one for testing the
# network’s accuracy. We will set the model to ``eval()`` mode and then run
# inference on the test dataloader. Calling ``eval()`` sets the training
# variable in all modules in the network to false. Certain layers like
# batch normalization and dropout layers behave differently during
# training, so this step is crucial for getting correct results.
#
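
#A small illustration (not part of the original tutorial) of what
#``eval()`` and ``train()`` actually toggle: the ``training`` flag on every
#module, which layers like batch norm and dropout consult at runtime.
model.eval()
print(model.training) #False
model.train()
print(model.training) #True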

def test(model, epoch):
    model.eval()
    correct = 0
    for data, target in test_loader:
        data = data.to(device)
        target = target.to(device)
        output = model(data)
        output = output.permute(1, 0, 2)
        pred = output.max(2)[1] #get the index of the max log-probability
        correct += pred.eq(target).cpu().sum().item()
    print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(test_loader.dataset), 100. * correct / len(test_loader.dataset)))


######################################################################
# Finally, we can train and test the network. We will train the network
# for thirty epochs, then reduce the learn rate and train for ten more epochs.
# The network will be tested after each epoch to see how the accuracy
# varies during the training.
#
# .. note:: Due to a build issue, we've reduced the number of epochs to 10.
#    Run this sample with 40 locally to get the proper values.
#

log_interval = 20
for epoch in range(1, 11):
    if epoch == 31:
        print("First round of training complete. Setting learn rate to 0.001.")
    scheduler.step()
    train(model, epoch)
    test(model, epoch)

######################################################################
# Conclusion
# ----------
#
# If trained on 9 folders, the network should be more than 50% accurate by
# the end of the training process. Training on fewer folders will result in
# a lower overall accuracy but may be necessary if long runtimes are a
# problem. Greater accuracies can be achieved using deeper CNNs at the
# expense of a larger memory footprint.
#
# For more advanced audio applications, such as speech recognition,
# recurrent neural networks (RNNs) are commonly used. There are also other
# data preprocessing methods, such as finding the mel frequency cepstral
# coefficients (MFCC), that can reduce the size of the dataset.
#
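
# As a pointer for experimentation (assuming a torchaudio version that
# provides ``transforms.MFCC``; the parameter values here are illustrative):

mfcc_transform = torchaudio.transforms.MFCC(sample_rate = 8000, n_mfcc = 40)
mfcc = mfcc_transform(torch.rand([1, 32000])) #dummy one-channel clip
print(mfcc.shape) #(channels, n_mfcc, time_frames)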