
Commit fbd68a5

brianjo authored and JoelMarcey committed
Reducing epochs in audio_classifier to fix build issue. (pytorch#398)
1 parent 0ae7340 commit fbd68a5

1 file changed: +29 -26 lines

beginner_source/audio_classifier_tutorial.py (+29 -26)
@@ -26,7 +26,7 @@
 ######################################################################
 # Let’s check if a CUDA GPU is available and select our device. Running
 # the network on a GPU will greatly decrease the training/testing runtime.
-#
+#

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(device)
@@ -35,15 +35,15 @@
 ######################################################################
 # Importing the Dataset
 # ---------------------
-#
+#
 # We will use the UrbanSound8K dataset to train our network. It is
 # available for free `here <https://urbansounddataset.weebly.com/>`_ and contains
 # 10 audio classes with over 8000 audio samples! Once you have downloaded
 # the compressed dataset, extract it to your current working directory.
 # First, we will look at the csv file that provides information about the
 # individual sound files. ``pandas`` allows us to open the csv file and
 # use ``.iloc()`` to access the data within it.
-#
+#

 csvData = pd.read_csv('./data/UrbanSound8K/metadata/UrbanSound8K.csv')
 print(csvData.iloc[0, :])
@@ -55,7 +55,7 @@
 # gun_shot, jackhammer, siren, and street_music. Let’s play a couple files
 # and see what they sound like. The first file is street music and the
 # second is an air conditioner.
-#
+#

 import IPython.display as ipd
 ipd.Audio('./data/UrbanSound8K/audio/fold1/108041-9-0-5.wav')
@@ -66,7 +66,7 @@
 ######################################################################
 # Formatting the Data
 # -------------------
-#
+#
 # Now that we know the format of the csv file entries, we can construct
 # our dataset. We will create a wrapper class for our dataset using
 # ``torch.utils.data.Dataset`` that will handle loading the files and
@@ -76,7 +76,7 @@
 # class will store the file names, labels, and folder numbers of the audio
 # files in the inputted folder list when initialized. The actual loading
 # and formatting steps will happen in the access function ``__getitem__``.
-#
+#
 # In ``__getitem__``, we use ``torchaudio.load()`` to convert the wav
 # files to tensors. ``torchaudio.load()`` returns a tuple containing the
 # newly created tensor along with the sampling frequency of the audio file
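A minimal sketch of the ``torchaudio.load()`` behavior this hunk describes (not part of the commit; the path reuses the sample file played earlier, and the exact tensor layout differs between torchaudio versions):

import torchaudio

# load() returns a (tensor, sample_rate) tuple, as the comment above notes;
# the tensor holds the raw samples, sample_rate is the frequency in Hz
soundData, sample_rate = torchaudio.load('./data/UrbanSound8K/audio/fold1/108041-9-0-5.wav')
print(soundData.shape, sample_rate)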
@@ -92,15 +92,15 @@
 # long enough to handle the downsampling so these tensors will need to be
 # padded with zeros. The minimum length that won’t require padding is
 # 160,000 samples.
-#
+#

 class UrbanSoundDataset(Dataset):
     # wrapper for the UrbanSound8K dataset
     # Argument List
     #   path to the UrbanSound8K csv file
     #   path to the UrbanSound8K audio files
     #   list of folders to use in the dataset
-
+
     def __init__(self, csv_path, file_path, folderList):
         csvData = pd.read_csv(csv_path)
         # initialize lists to hold file names, labels, and folder numbers
@@ -113,11 +113,11 @@ def __init__(self, csv_path, file_path, folderList):
                 self.file_names.append(csvData.iloc[i, 0])
                 self.labels.append(csvData.iloc[i, 6])
                 self.folders.append(csvData.iloc[i, 5])
-
+
         self.file_path = file_path
         self.mixer = torchaudio.transforms.DownmixMono()  # UrbanSound8K uses two channels, this will convert them to one
         self.folderList = folderList
-
+
     def __getitem__(self, index):
         # format the file path and load the file
         path = self.file_path + "fold" + str(self.folders[index]) + "/" + self.file_names[index]
@@ -130,17 +130,17 @@ def __getitem__(self, index):
             tempData[:soundData.numel()] = soundData[:]
         else:
             tempData[:] = soundData[:160000]
-
+
         soundData = tempData
         soundFormatted = torch.zeros([32000, 1])
         soundFormatted[:32000] = soundData[::5]  # take every fifth sample of soundData
         soundFormatted = soundFormatted.permute(1, 0)
         return soundFormatted, self.labels[index]
-
+
     def __len__(self):
         return len(self.file_names)

-
+
csv_path = './data/UrbanSound8K/metadata/UrbanSound8K.csv'
file_path = './data/UrbanSound8K/audio/'

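A hedged usage sketch of the wrapper class above (the actual train/test split lives in unchanged lines of the tutorial, so the fold choices and batch size here are assumptions, not part of this commit):

# train on folds 1-9, hold out fold 10 for testing (assumed split)
train_set = UrbanSoundDataset(csv_path, file_path, range(1, 10))
test_set = UrbanSoundDataset(csv_path, file_path, [10])

train_loader = torch.utils.data.DataLoader(train_set, batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=128, shuffle=False)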
@@ -158,7 +158,7 @@ def __len__(self):
 ######################################################################
 # Define the Network
 # ------------------
-#
+#
 # For this tutorial we will use a convolutional neural network to process
 # the raw audio data. Usually more advanced transforms are applied to the
 # audio data; however, CNNs can be used to accurately process the raw data.
@@ -169,7 +169,7 @@
 # processing audio sampled at 8kHz the receptive field is around 10ms.
 # This size is similar to speech processing applications that often use
 # receptive fields ranging from 20ms to 40ms.
-#
+#

 class Net(nn.Module):
     def __init__(self):
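The 10ms figure in this hunk is simple arithmetic, worked here as a check (the 80-sample receptive field is the tutorial's own number; the audio is effectively at 8kHz after the every-fifth-sample downsampling above):

receptive_field_samples = 80
sample_rate_hz = 8000  # 160,000 samples over 4 seconds, downsampled by 5
print(receptive_field_samples / sample_rate_hz * 1000)  # 10.0 ms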
@@ -188,7 +188,7 @@ def __init__(self):
         self.pool4 = nn.MaxPool1d(4)
         self.avgPool = nn.AvgPool1d(30)  # input should be 512x30 so this outputs a 512x1
         self.fc1 = nn.Linear(512, 10)
-
+
     def forward(self, x):
         x = self.conv1(x)
         x = F.relu(self.bn1(x))
@@ -217,7 +217,7 @@ def forward(self, x):
 # optimizer with weight decay set to 0.0001. At first, we will train with
 # a learning rate of 0.01, but we will use a ``scheduler`` to decrease it
 # to 0.001 during training.
-#
+#

 optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay = 0.0001)
 scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 20, gamma = 0.1)
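A quick hedged check of what the ``StepLR`` line above does: with ``step_size = 20`` and ``gamma = 0.1``, the learning rate is multiplied by 0.1 every 20 ``scheduler.step()`` calls, taking it from 0.01 to 0.001 as the surrounding text promises. A sketch, assuming the optimizer and scheduler defined above (note this advances the real scheduler's state):

for _ in range(20):
    scheduler.step()
print(optimizer.param_groups[0]['lr'])  # 0.001 once 20 steps have elapsed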
@@ -226,10 +226,10 @@ def forward(self, x):
 ######################################################################
 # Training and Testing the Network
 # --------------------------------
-#
+#
 # Now let’s define a training function that will feed our training data
 # into the model and perform the backward pass and optimization steps.
-#
+#

 def train(model, epoch):
     model.train()
@@ -239,7 +239,7 @@ def train(model, epoch):
         target = target.to(device)
         data = data.requires_grad_()  # set requires_grad to True for training
         output = model(data)
-        output = output.permute(1, 0, 2)  # original output dimensions are batchSizex1x10
+        output = output.permute(1, 0, 2)  # original output dimensions are batchSizex1x10
         loss = F.nll_loss(output[0], target)  # the loss function expects a batchSizex10 input
         loss.backward()
         optimizer.step()
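A hedged illustration of the shape handling in this hunk: the model emits ``batchSize x 1 x 10``, ``permute(1, 0, 2)`` moves the singleton dimension to the front, and ``output[0]`` is then the ``batchSize x 10`` matrix ``F.nll_loss`` expects (the batch size of 128 below is an assumption):

import torch

dummy_output = torch.randn(128, 1, 10)     # batchSize x 1 x 10, as above
reordered = dummy_output.permute(1, 0, 2)  # 1 x batchSize x 10
print(reordered[0].shape)                  # torch.Size([128, 10])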
@@ -256,7 +256,7 @@ def train(model, epoch):
 # variable in all modules in the network to false. Certain layers like
 # batch normalization and dropout layers behave differently during
 # training so this step is crucial for getting correct results.
-#
+#

 def test(model, epoch):
     model.eval()
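The ``training`` flag the text mentions is easy to confirm; a small hedged check using the ``bn1`` layer defined in ``Net`` above:

model.eval()               # sets module.training = False recursively
print(model.bn1.training)  # False: batch norm now uses running statistics
model.train()              # restore training mode
print(model.bn1.training)  # True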
@@ -278,10 +278,13 @@ def test(model, epoch):
 # for ten epochs then reduce the learning rate and train for ten more epochs.
 # The network will be tested after each epoch to see how the accuracy
 # varies during the training.
-#
+#
+# .. note:: Due to a build issue, we've reduced the number of epochs to 10.
+#    Run this sample with 40 locally to get the proper values.
+#

 log_interval = 20
-for epoch in range(1, 41):
+for epoch in range(1, 11):
     if epoch == 31:
         print("First round of training complete. Setting learn rate to 0.001.")
     scheduler.step()
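Per the added note, a sketch of restoring the original 40-epoch run locally; this simply reverts the bound changed in this commit, and the train/test calls are assumed from the function definitions above rather than copied from this diff:

log_interval = 20
for epoch in range(1, 41):  # 40 epochs instead of the build-friendly 10
    if epoch == 31:
        print("First round of training complete. Setting learn rate to 0.001.")
    scheduler.step()
    train(model, epoch)
    test(model, epoch)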
@@ -292,16 +295,16 @@ def test(model, epoch):
292295
######################################################################
293296
# Conclusion
294297
# ----------
295-
#
298+
#
296299
# If trained on 9 folders, the network should be more than 50% accurate by
297300
# the end of the training process. Training on less folders will result in
298301
# a lower overall accuracy but may be necessary if long runtimes are a
299302
# problem. Greater accuracies can be achieved using deeper CNNs at the
300303
# expense of a larger memory footprint.
301-
#
304+
#
302305
# For more advanced audio applications, such as speech recognition,
303306
# recurrent neural networks (RNNs) are commonly used. There are also other
304307
# data preprocessing methods, such as finding the mel frequency cepstral
305308
# coefficients (MFCC), that can reduce the size of the dataset.
306-
#
309+
#
307310
