# Inspiration at: https://medium.com/@LeonFedden/comparative-audio-analysis-with-wavenet-mfccs-umap-t-sne-and-pca-cb8237bfce2f
import os
import librosa
import librosa.display
import numpy as np
from timeit import default_timer as timer
import matplotlib.pyplot as plt
file_path = "INPUTSAMPLE.wav"
""" Loading a whole folder:
directory = './path/to/my/audio/folder/'
for file in os.listdir(directory):
if file.endswith('.wav'):
file_path = os.path.join(directory, file)
audio_data, _ = librosa.load(file_path)
"""
# LIBROSA:
# https://librosa.github.io/librosa/master/feature.html?highlight=features
# 1 - MFCC
print("1 - MFCC")
sample_rate = 22050  # librosa.load() resamples to 22050 Hz by default
mfcc_size = 13
# Load the audio (1 second only)
pcm_data, _ = librosa.load(file_path, duration=1)
# Warm-up call so the timing below excludes first-call overhead
_ = librosa.feature.mfcc(y=pcm_data, sr=sample_rate, n_mfcc=mfcc_size)
start = timer()
# Compute a (mfcc_size, n_frames) matrix of MFCCs
mfccs = librosa.feature.mfcc(y=pcm_data, sr=sample_rate, n_mfcc=mfcc_size)
end = timer()
time = (end - start)
print("mfccs run took " + str(time) + "s (" + str(time / 60.0) + "min)")
print("np.asarray(mfccs).shape", np.asarray(mfccs).shape) # 1sec = (13, 44) = 572 features! // 2sec = (13, 87)
# CPU - mfccs run took 0.004294439999284805s (7.157399998808008e-05min)
plt.figure(figsize=(10, 4))
librosa.display.specshow(mfccs, x_axis='time')
plt.colorbar()
plt.title('MFCC')
plt.tight_layout()
plt.show()
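############################
# A minimal pooling sketch (not in the original script): the Medium article
# linked above reduces each MFCC matrix to a fixed-length vector (mean + std
# over time) so that clips of different lengths become comparable. Uses
# `mfccs` from above; the variable names here are illustrative.
mean_mfccs = np.mean(mfccs, axis=1)
stddev_mfccs = np.std(mfccs, axis=1)
mfcc_features = np.hstack((mean_mfccs, stddev_mfccs))
print("pooled mfcc_features shape:", mfcc_features.shape)  # (26,) regardless of clip length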
############################
# Chroma (constant-Q transform)
start = timer()
chroma_cq = librosa.feature.chroma_cqt(y=pcm_data, sr=sample_rate)
end = timer()
time = (end - start)
print("chroma_cq run took " + str(time) + "s (" + str(time / 60.0) + "min)")
print("np.asarray(chroma_cq).shape", np.asarray(chroma_cq).shape)
# chroma_cq run took 0.20499863999793888s (0.003416643999965648min)
# np.asarray(chroma_cq).shape (12, 44)
plt.figure()
librosa.display.specshow(chroma_cq, y_axis='chroma', x_axis='time')
plt.title('chroma_cqt')
plt.colorbar()
plt.tight_layout()
plt.show()
############################
# Chroma (STFT)
start = timer()
#S = np.abs(librosa.stft(pcm_data))
#chroma = librosa.feature.chroma_stft(S=S, sr=sample_rate)
S = np.abs(librosa.stft(pcm_data, n_fft=4096))**2  # power spectrogram
chroma = librosa.feature.chroma_stft(S=S, sr=sample_rate)
end = timer()
time = (end - start)
print("chroma run took " + str(time) + "s (" + str(time / 60.0) + "min)")
print("np.asarray(chroma).shape", np.asarray(chroma).shape)
# chroma run took 0.010794457000883995s (0.0001799076166813999min)
# np.asarray(chroma).shape (12, 22) - fewer frames than chroma_cqt because
# librosa.stft's default hop_length is n_fft // 4 = 1024 samples here
plt.figure(figsize=(10, 4))
librosa.display.specshow(chroma, y_axis='chroma', x_axis='time')
plt.colorbar()
plt.title('Chromagram')
plt.tight_layout()
plt.show()
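############################
# Sketch (assumption, not in the original): passing hop_length=512 explicitly
# matches the 44-frame resolution of the MFCC and chroma_cqt features above,
# which makes the three feature matrices easy to stack frame-by-frame.
S_hop = np.abs(librosa.stft(pcm_data, n_fft=4096, hop_length=512))**2
chroma_hop = librosa.feature.chroma_stft(S=S_hop, sr=sample_rate)
print("chroma with hop_length=512:", chroma_hop.shape)  # expected (12, 44)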
######################################################################################################################################################
# 2 - WaveNet
# install magenta -> https://github.com/tensorflow/magenta
# source activate magenta
print("2 - WaveNet")
from magenta.models.nsynth import utils
from magenta.models.nsynth.wavenet import fastgen
def wavenet_encode(file_path):
    # Load the model weights.
    checkpoint_path = './wavenet-ckpt/model.ckpt-200000'
    # Load and downsample the audio.
    neural_sample_rate = 16000
    audio = utils.load_audio(file_path,
                             sample_length=400000,
                             sr=neural_sample_rate)
    # Pass the audio through the first half of the autoencoder
    # to get a list of latent variables that describe the sound.
    # Note that it would be quicker to pass a batch of audio to fastgen.
    encoding = fastgen.encode(audio, checkpoint_path, len(audio))
    # Reshape to a single sound.
    return encoding.reshape((-1, 16))
# Warm-up call so the timing below excludes first-call overhead
_ = wavenet_encode(file_path)
# The encoding is an array of n x 16 latent frames.
start = timer()
wavenet_z_data = wavenet_encode(file_path)
end = timer()
time = (end - start)
print("wavenet_encode run took " + str(time) + "s (" + str(time / 60.0) + "min)")
# Magenta-CPU wavenet_encode run took 30.804440063000584s (0.5134073343833431min)
# Magenta-GPU wavenet_encode run took 12.997072504000243s (0.2166178750666707min) <<<< STILL VERY SLOW!
# visualize with:
# 16 colors, each latent channel as one line in the plot - sketch below
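# A minimal visualization sketch (assumption, not part of the original script):
plt.figure(figsize=(10, 4))
for channel in range(wavenet_z_data.shape[1]):
    plt.plot(wavenet_z_data[:, channel], linewidth=0.8)
plt.title('WaveNet latent encoding (16 channels)')
plt.xlabel('frame')
plt.tight_layout()
plt.show()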
print("np.asarray(wavenet_z_data).shape", np.asarray(wavenet_z_data).shape)  # (781, 16) for this input
# reconstruct with:
# fastgen.synthesize (the decoder half of the enc > z > dec architecture) - sketch below