Commit f268a7c

Author: Schuler Henry Martin (BhP/HRL3.2-SH1)
Commit message: Added NN test for LPC and LPCC
1 parent: d46145c

File tree: 7 files changed (+401, −2 lines)
code/feature_extraction/LPCExtractor.py

Lines changed: 36 additions & 0 deletions
import numpy as np
import librosa


class LPCExtractor:
    @staticmethod
    def lpc(frames, order=12):
        lpc_coefficients = []

        for frame in frames:
            lpc_coefficients.append(librosa.lpc(y=frame, order=order))

        return lpc_coefficients

    @staticmethod
    def lpcc(lpc_list, order=12):
        # At the moment the maximum LPCC order is the LPC order; if the
        # requested LPCC order is higher, the remaining entries stay zero.
        lpcc_coefficients = []

        for lpc in lpc_list:
            lpcc = np.zeros(order + 1)
            lpcc[0] = lpc[0]  # normally the power of the signal is used here
            lpcc[1] = lpc[1]

            for n in range(2, order + 1):
                if n < lpc.shape[0]:
                    lpcc[n] = sum((1 - k/n) * lpc[k] * lpcc[n-k] for k in range(1, n)) + lpc[n]
                else:
                    pass  # there is a formula in the literature for this case, but it does not seem to make sense here

            lpcc_coefficients.append(lpcc)

        return lpcc_coefficients

    @staticmethod
    def get_lpcc_from_frames(frames, order=12):
        return LPCExtractor.lpcc(lpc_list=LPCExtractor.lpc(frames=frames, order=order), order=order)
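
A minimal usage sketch (not part of the commit): the random frames below are illustrative stand-ins for the windowed frames produced by AudioPreprocessor.

import numpy as np

rng = np.random.default_rng(0)
frames = [rng.standard_normal(500) for _ in range(3)]  # three synthetic 500-sample frames

lpccs = LPCExtractor.get_lpcc_from_frames(frames=frames, order=12)
print(len(lpccs))      # 3: one coefficient vector per frame
print(lpccs[0].shape)  # (13,): order + 1 cepstral coefficients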
Binary file not shown.

code/main.py

Lines changed: 4 additions & 2 deletions
@@ -1,8 +1,10 @@
 from preprocessing.AudioPreprocessor import AudioPreprocessor
+from feature_extraction.LPCExtractor import LPCExtractor

 def main():
-    frames = AudioPreprocessor.load_preprocessed_frames("./audio.wav")
-    print(frames)
+    frames = AudioPreprocessor.load_preprocessed_frames("C:\\Users\\SCU8BH\\Documents\\T3000\\Studienarbeit\\Data\\50_speakers_audio_data\\Speaker_0003\\Speaker_0003_00000.wav")
+    lpccs = LPCExtractor.get_lpcc_from_frames(frames=frames, order=12)
+    print(len(lpccs))

 if __name__ == "__main__":
     main()
Binary file not shown.

code/test.ipynb

Lines changed: 235 additions & 0 deletions
In [1]:
from utils.utils import Utils
from preprocessing.AudioPreprocessor import AudioPreprocessor
from feature_extraction.LPCExtractor import LPCExtractor
import numpy as np
import tensorflow as tf
from tensorflow import keras
In [52]:
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

def get_data_set(count, speakers):
    third = int(count/speakers)  # feature vectors per speaker
    X = np.zeros((count, 12*20))
    y = np.zeros(count, dtype='uint8')

    coefficients_per_speaker = third * 20  # 20 frames go into each feature vector

    all_speakers = []
    for i in range(0, speakers):
        all_speakers.append([])
        index = 0
        while len(all_speakers[i]) < coefficients_per_speaker:
            print(index, end="\r")
            y_, sr = Utils.load_file(f"C:\\Users\\SCU8BH\\Documents\\T3000\\Studienarbeit\\Data\\50_speakers_audio_data\\Speaker{i+30:04}\\Speaker{i+30:02}_{index:03}.wav")

            y_ = AudioPreprocessor.remove_noise(y=y_, sr=sr)
            y_ = AudioPreprocessor.remove_silence(y=y_)
            frames = AudioPreprocessor.create_frames(y=y_, frame_size=500, overlap=100)
            frames = AudioPreprocessor.window_frames(frames=frames)

            lpcc = LPCExtractor.lpc(frames=frames, order=12)
            # lpcc = LPCExtractor.lpcc(lpc_list=lpc, order=12)

            all_speakers[i] += lpcc

            index += 1
        print()

    for i in range(0, speakers):
        for j in range(0, third):
            # Concatenate the 12 predictor coefficients of 20 consecutive
            # frames into one 240-dimensional feature vector.
            X[i*third + j] = np.concatenate([all_speakers[i][20*j + k][1:13] for k in range(20)])
            y[i*third + j] = i

    return X, y
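
A note on the [1:13] slice above: librosa.lpc returns order + 1 = 13 polynomial coefficients per frame with the leading coefficient fixed to 1, so the slice keeps the 12 informative predictor coefficients. Note also that, despite its name, lpcc here holds plain LPC vectors; the LPCC conversion is commented out.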
In [53]:
count = 5500
speakers = 5
X, y = get_data_set(count=count, speakers=speakers)

Output:
8
11
9
10
10
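
Because of the end="\r" carriage return, only the last printed index survives per speaker: the five numbers above are the index of the last audio file that had to be read for each of the five speakers.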
In [63]:
def main(X, y, speakers):
    print(y)
    X, y = unison_shuffled_copies(X, y)
    print(y)
    # model takes 20 frames of 12 coefficients each (240 inputs)
    model = keras.Sequential([
        keras.layers.Flatten(input_shape=[12*20]),
        keras.layers.Dense(16, activation=tf.nn.relu),
        keras.layers.Dense(16, activation=tf.nn.relu),
        keras.layers.Dense(speakers, activation=tf.nn.softmax)
    ])

    model.compile(optimizer=tf.optimizers.Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # NOTE: this trains on the last sixth of the shuffled data, which is
    # essentially the same slice evaluated below.
    model.fit(X[int(5*count/6):], y[int(5*count/6):], epochs=1000, verbose=0)

    test_loss, test_acc = model.evaluate(X[-int(count/6):], y[-int(count/6):])

    print(f"Test accuracy: {test_acc}")
    print(f"Test loss: {test_loss}")

    y_, sr = Utils.load_file(f"C:\\Users\\SCU8BH\\Documents\\T3000\\Studienarbeit\\Data\\50_speakers_audio_data\\Speaker0032\\Speaker32_012.wav")

    y_ = AudioPreprocessor.remove_noise(y=y_, sr=sr)
    y_ = AudioPreprocessor.remove_silence(y=y_)
    frames = AudioPreprocessor.create_frames(y=y_, frame_size=500, overlap=100)
    frames = AudioPreprocessor.window_frames(frames=frames)

    lpcc = LPCExtractor.lpc(frames=frames, order=12)
    # lpcc = LPCExtractor.lpcc(lpc_list=lpc, order=12)

    X = np.zeros((int(len(lpcc)/20), 12*20))

    for j in range(0, int(len(lpcc)/20)):
        # Same 20-frame feature layout as in get_data_set.
        X[j] = np.concatenate([lpcc[20*j + k][1:13] for k in range(20)])
    if X.shape[0] > 100:
        X_2 = X[-100:]
    pred = model.predict(X)
    print(np.argmax(pred, axis=1))
    print(np.count_nonzero(np.argmax(pred, axis=1) == 0))
    print(np.count_nonzero(np.argmax(pred, axis=1) == 1))
    print(np.count_nonzero(np.argmax(pred, axis=1) == 2))
    print(np.count_nonzero(np.argmax(pred, axis=1) == 3))
    print(np.count_nonzero(np.argmax(pred, axis=1) == 4))
    # print(y[-100:])

if __name__ == "__main__":
    main(X, y, speakers)

Output:
[0 0 0 ... 4 4 4]
[4 2 3 ... 2 2 4]
29/29 [==============================] - 0s 1ms/step - loss: 2.1533e-05 - accuracy: 1.0000
Test accuracy: 1.0
Test loss: 2.1533451217692345e-05
4/4 [==============================] - 0s 1ms/step
[4 2 2 2 2 2 2 2 2 2 2 2 1 4 2 2 1 1 1 1 2 2 4 2 1 2 2 2 2 2 2 4 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 4 1 2 2 2 2 2 1 2 3 1 1 2 2 2 4 2 4 2 2 2
 2 2 2 2 4 1 0 2 4 2 4 2 4 2 1 2 4 2 3 3 2 2 2 2 2 2 2 4 3 4 1 0 2 1 2 2 4
 2 2 4 2 2 0 0 0]
6
14
79
4
16
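
The fit call above trains on the last sixth of the shuffled data, and evaluate then scores essentially the same slice, which is why the test accuracy is 1.0 while the per-file predictions for Speaker0032 scatter across all five classes. A minimal sketch of a disjoint split (not part of the commit; it assumes the X, y, count, and model from the cells above):

# A disjoint split: first 5/6 for training, last 1/6 held out for testing.
split = int(5 * count / 6)
X_train, y_train = X[:split], y[:split]
X_test, y_test = X[split:], y[split:]

model.fit(X_train, y_train, epochs=1000, verbose=0)
test_loss, test_acc = model.evaluate(X_test, y_test)  # measured on unseen vectors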
Notebook metadata: Python 3.10.4, ipython3 kernel, nbformat 4.

0 commit comments