"""
Audio Classifier Tutorial
=========================
**Author**: `Winston Herring <https://github.com/winston6>`_

This tutorial will show you how to correctly format an audio dataset and
then train/test an audio classifier network on it. First, let’s
import the common torch packages as well as ``torchaudio``, ``pandas``,
and ``numpy``. ``torchaudio`` is available `here <https://github.com/pytorch/audio>`_
and can be installed by following the
instructions on the website.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
import torchaudio
import pandas as pd
import numpy as np
######################################################################
# Let’s check if a CUDA GPU is available and select our device. Running
# the network on a GPU will greatly decrease the training/testing runtime.
#
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
######################################################################
# Importing the Dataset
# ---------------------
#
# We will use the UrbanSound8K dataset to train our network. It is
# available for free `here <https://urbansounddataset.weebly.com/>`_ and contains
# 10 audio classes with over 8000 audio samples! Once you have downloaded
# the compressed dataset, extract it to your current working directory.
# First, we will look at the csv file that provides information about the
# individual sound files. ``pandas`` allows us to open the csv file and
# use ``.iloc`` to access the data within it.
#
csvData = pd.read_csv('./data/UrbanSound8K/metadata/UrbanSound8K.csv')
print(csvData.iloc[0, :])
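######################################################################
# It can also be helpful to see how the samples are distributed across
# the ten classes. As a quick, optional check (assuming the standard
# UrbanSound8K metadata layout, where the class name is stored in the
# ``class`` column):
#
print(csvData['class'].value_counts())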
######################################################################
# The 10 audio classes in the UrbanSound8K dataset are air_conditioner,
# car_horn, children_playing, dog_bark, drilling, engine_idling,
# gun_shot, jackhammer, siren, and street_music. Let’s play a couple files
# and see what they sound like. The first file is street music and the
# second is an air conditioner.
#
import IPython.display as ipd
ipd.Audio('./data/UrbanSound8K/audio/fold1/108041-9-0-5.wav')
ipd.Audio('./data/UrbanSound8K/audio/fold5/100852-0-0-19.wav')
######################################################################
# Formatting the Data
# -------------------
#
# Now that we know the format of the csv file entries, we can construct
# our dataset. We will create a wrapper class for our dataset using
# ``torch.utils.data.Dataset`` that will handle loading the files and
# performing some formatting steps. The UrbanSound8K dataset is separated
# into 10 folders. We will use the data from 9 of these folders to train
# our network and then use the 10th folder to test the network. The wrapper
# class will store the file names, labels, and folder numbers of the audio
# files in the given folder list when initialized. The actual loading
# and formatting steps will happen in the access function ``__getitem__``.
#
# In ``__getitem__``, we use ``torchaudio.load()`` to convert the wav
# files to tensors. ``torchaudio.load()`` returns a tuple containing the
# newly created tensor along with the sampling frequency of the audio file
# (44.1kHz for UrbanSound8K). The dataset uses two channels for audio so
# we will use ``torchaudio.transforms.DownmixMono()`` to convert the audio
# data to one channel. Next, we need to format the audio data. The network
# we will make takes an input size of 32,000, while most of the audio
# files have well over 100,000 samples. The UrbanSound8K audio is sampled
# at 44.1kHz, so 32,000 samples only covers around 700 milliseconds. By
# downsampling the audio to approximately 8kHz, we can represent 4 seconds
# with the 32,000 samples. This downsampling is achieved by taking every
# fifth sample of the original audio tensor. Not every audio tensor is
# long enough to handle the downsampling so these tensors will need to be
# padded with zeros. The minimum length that won’t require padding is
# 160,000 samples.
#
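######################################################################
# As a quick sanity check on the numbers above, we can verify the
# arithmetic directly (an optional illustration, not needed for training):
#
sample_rate = 44100                #UrbanSound8K sampling frequency
print(32000 / sample_rate)         #~0.73 seconds covered by 32,000 raw samples
print(sample_rate / 5)             #effective rate after keeping every fifth sample, ~8.8kHz
print(160000 / 5)                  #160,000 padded samples downsample to exactly 32,000
print(160000 / sample_rate)        #~3.6 seconds of audio represented per clip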
class UrbanSoundDataset(Dataset):
    #wrapper for the UrbanSound8K dataset
    # Argument List
    #  path to the UrbanSound8K csv file
    #  path to the UrbanSound8K audio files
    #  list of folders to use in the dataset

    def __init__(self, csv_path, file_path, folderList):
        csvData = pd.read_csv(csv_path)
        #initialize lists to hold file names, labels, and folder numbers
        self.file_names = []
        self.labels = []
        self.folders = []
        #loop through the csv entries and only add entries from folders in the folder list
        for i in range(0, len(csvData)):
            if csvData.iloc[i, 5] in folderList:
                self.file_names.append(csvData.iloc[i, 0])
                self.labels.append(csvData.iloc[i, 6])
                self.folders.append(csvData.iloc[i, 5])

        self.file_path = file_path
        self.mixer = torchaudio.transforms.DownmixMono() #UrbanSound8K uses two channels, this will convert them to one
        self.folderList = folderList

    def __getitem__(self, index):
        #format the file path and load the file
        path = self.file_path + "fold" + str(self.folders[index]) + "/" + self.file_names[index]
        sound = torchaudio.load(path, out = None, normalization = True)
        #load returns a tensor with the sound data and the sampling frequency (44.1kHz for UrbanSound8K)
        soundData = self.mixer(sound[0])
        #downsample the audio to ~8kHz
        tempData = torch.zeros([160000, 1]) #tempData accounts for audio clips that are too short
        if soundData.numel() < 160000:
            tempData[:soundData.numel()] = soundData[:]
        else:
            tempData[:] = soundData[:160000]

        soundData = tempData
        soundFormatted = torch.zeros([32000, 1])
        soundFormatted[:32000] = soundData[::5] #take every fifth sample of soundData
        soundFormatted = soundFormatted.permute(1, 0)
        return soundFormatted, self.labels[index]

    def __len__(self):
        return len(self.file_names)
csv_path = './data/UrbanSound8K/metadata/UrbanSound8K.csv'
file_path = './data/UrbanSound8K/audio/'
train_set = UrbanSoundDataset(csv_path, file_path, range(1,10))
test_set = UrbanSoundDataset(csv_path, file_path, [10])
print("Train set size: " + str(len(train_set)))
print("Test set size: " + str(len(test_set)))
kwargs = {'num_workers': 1, 'pin_memory': True} if device.type == 'cuda' else {} #needed for using datasets on gpu
train_loader = torch.utils.data.DataLoader(train_set, batch_size = 128, shuffle = True, **kwargs)
test_loader = torch.utils.data.DataLoader(test_set, batch_size = 128, shuffle = True, **kwargs)
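######################################################################
# As an optional check that the formatting pipeline behaves as expected,
# we can pull one batch from the loader and inspect it. With the settings
# above, ``data`` should be a [128, 1, 32000] tensor
# (batch x channel x samples) and ``target`` a tensor of 128 class labels.
#
data, target = next(iter(train_loader))
print(data.shape)
print(target[:5])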
######################################################################
# Define the Network
# ------------------
#
# For this tutorial we will use a convolutional neural network to process
# the raw audio data. Usually more advanced transforms are applied to the
# audio data, however CNNs can be used to accurately process the raw data.
# The specific architecture is modeled after the M5 network architecture
# described in https://arxiv.org/pdf/1610.00087.pdf. An important aspect
# of models processing raw audio data is the receptive field of their
# first layer’s filters. Our model’s first filter is length 80 so when
# processing audio sampled at 8kHz the receptive field is around 10ms.
# This size is similar to speech processing applications that often use
# receptive fields ranging from 20ms to 40ms.
#
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv1d(1, 128, 80, 4)
        self.bn1 = nn.BatchNorm1d(128)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(128, 128, 3)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(4)
        self.conv3 = nn.Conv1d(128, 256, 3)
        self.bn3 = nn.BatchNorm1d(256)
        self.pool3 = nn.MaxPool1d(4)
        self.conv4 = nn.Conv1d(256, 512, 3)
        self.bn4 = nn.BatchNorm1d(512)
        self.pool4 = nn.MaxPool1d(4)
        self.avgPool = nn.AvgPool1d(30) #input should be 512x30 so this outputs a 512x1
        self.fc1 = nn.Linear(512, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(self.bn1(x))
        x = self.pool1(x)
        x = self.conv2(x)
        x = F.relu(self.bn2(x))
        x = self.pool2(x)
        x = self.conv3(x)
        x = F.relu(self.bn3(x))
        x = self.pool3(x)
        x = self.conv4(x)
        x = F.relu(self.bn4(x))
        x = self.pool4(x)
        x = self.avgPool(x)
        x = x.permute(0, 2, 1) #change the 512x1 to 1x512
        x = self.fc1(x)
        return F.log_softmax(x, dim = 2)
model = Net()
model.to(device)
print(model)
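######################################################################
# As an optional sanity check, we can push one dummy clip through the
# model to confirm the shapes work out: the 32,000-sample input shrinks
# to 512x30 before the average pool, and the output holds one
# log-probability per class. Note also that the first filter spans 80
# samples, i.e. 80 / 8000Hz = 10ms of audio, as discussed above.
#
model.eval() #use eval mode so the random input does not disturb the batch norm statistics
with torch.no_grad():
    dummy = torch.randn(1, 1, 32000).to(device) #one fake clip: batch x channel x samples
    print(model(dummy).shape)                   #expect torch.Size([1, 1, 10])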
######################################################################
# We will use the same optimization technique used in the paper, an Adam
# optimizer with weight decay set to 0.0001. At first, we will train with
# a learning rate of 0.01, but we will use a ``scheduler`` to decrease it
# to 0.001 during training.
#
optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay = 0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 20, gamma = 0.1)
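######################################################################
# To see what this schedule does, we can mirror the ``StepLR`` decay rule
# (``lr = initial_lr * gamma ** (epoch // step_size)``) with plain
# arithmetic; this optional illustration avoids touching the real
# scheduler state. The learning rate stays at 0.01 for the first 20
# epochs and drops to 0.001 afterwards.
#
initial_lr, gamma, step_size = 0.01, 0.1, 20
for demo_epoch in [1, 20, 21, 40]: #assuming the scheduler is stepped at the end of each epoch
    print(demo_epoch, initial_lr * gamma ** ((demo_epoch - 1) // step_size))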
######################################################################
# Training and Testing the Network
# --------------------------------
#
# Now let’s define a training function that will feed our training data
# into the model and perform the backward pass and optimization steps.
#
def train(model, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        data = data.to(device)
        target = target.to(device)
        data = data.requires_grad_() #set requires_grad to True for training
        output = model(data)
        output = output.permute(1, 0, 2) #original output dimensions are batchSizex1x10
        loss = F.nll_loss(output[0], target) #the loss function expects a batchSizex10 input
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0: #print training stats
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
######################################################################
# Now that we have a training function, we need to make one for testing
# the network's accuracy. We will set the model to ``eval()`` mode and then
# run inference on the test dataset. Calling ``eval()`` sets the training
# variable in all modules in the network to ``False``. Certain layers like
# batch normalization and dropout layers behave differently during
# training so this step is crucial for getting correct results.
#
def test(model, epoch):
    model.eval()
    correct = 0
    with torch.no_grad(): #gradients are not needed for inference
        for data, target in test_loader:
            data = data.to(device)
            target = target.to(device)
            output = model(data)
            output = output.permute(1, 0, 2)
            pred = output.max(2)[1] #get the index of the max log-probability
            correct += pred.eq(target).cpu().sum().item()
    print('\nTest set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
######################################################################
# Finally, we can train and test the network. We will train the network
# for twenty epochs, then reduce the learning rate and train for twenty
# more epochs. The network will be tested after each epoch to see how
# the accuracy varies during the training.
#
# .. note:: Due to a build issue, we've reduced the number of epochs to 10.
# Run this sample with 40 locally to get the proper values.
#
log_interval = 20
for epoch in range(1, 11): #use range(1, 41) locally for the full 40-epoch run
    if epoch == 21:
        print("First round of training complete. Setting learning rate to 0.001.")
    train(model, epoch)
    test(model, epoch)
    scheduler.step() #decay the learning rate on the StepLR schedule after each epoch
######################################################################
# Conclusion
# ----------
#
# If trained on 9 folders, the network should be more than 50% accurate by
# the end of the training process. Training on fewer folders will result
# in a lower overall accuracy but may be necessary if long runtimes are a
# problem. Higher accuracy can be achieved using deeper CNNs, at the
# expense of a larger memory footprint.
#
# For more advanced audio applications, such as speech recognition,
# recurrent neural networks (RNNs) are commonly used. There are also other
# data preprocessing methods, such as finding the mel frequency cepstral
# coefficients (MFCC), that can reduce the size of the input data.
#
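#
######################################################################
# As a hedged pointer for further experimentation: newer releases of
# ``torchaudio`` include an ``MFCC`` transform that could replace the raw
# waveform input above with a much smaller feature representation. This
# sketch assumes a ``torchaudio`` version that provides
# ``torchaudio.transforms.MFCC``; check your installed version first.
#
# ::
#
#     mfcc_transform = torchaudio.transforms.MFCC(sample_rate=8000, n_mfcc=40)
#     waveform, label = train_set[0]     #one formatted clip: [1, 32000]
#     features = mfcc_transform(waveform)
#     print(features.shape)              #[1, 40, time]: far fewer values than 32,000 raw samples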