-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
111 lines (87 loc) · 3.07 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import kagglehub
import numpy as np
from numpy.linalg import norm
from speechbrain.inference.speaker import EncoderClassifier
import matplotlib.pyplot as plt
import torchaudio
from ecdf_models import invexp
from scipy.optimize import curve_fit
# ===================== Configuration =====================
# --- Analysis parameters ---
# Identifier of the pretrained speaker-embedding model handed to EncoderClassifier.
MODEL = "speechbrain/spkrec-ecapa-voxceleb"
# How many clip numbers are kept from the shuffled index list.
SAMPLE_SIZE = 100
# Seed for the NumPy random generator, so runs are repeatable.
SEED = 80_085
# Callable fitted to the ECDF with curve_fit (only used when FIT_MODEL_ECDF is on).
ECDF_MODEL = invexp

# --- Output toggles (each one enables one report/plot further down) ---
PRINT_BOUNDS = True      # print the largest and smallest similarity score
SHOW_MATRIX = True       # heatmap of the pairwise similarity matrix
SHOW_HIST = True         # histogram of all similarity scores
SHOW_ECDF = True         # empirical CDF plot of the scores
PRINT_QUANTILES = True   # print selected quantiles of the scores
FIT_MODEL_ECDF = True    # fit ECDF_MODEL to the ECDF and overlay it
# =========================================================
# Fetch the Common Voice dataset through kagglehub and remember where it
# lives on disk (the download caches itself after the first run).
kagglePath = kagglehub.dataset_download("mozillaorg/common-voice")

# Seeded generator: the same SEED always produces the same selection.
randgen = np.random.default_rng(SEED)

# Shuffle the clip numbers 0..4066 and keep the first SAMPLE_SIZE of them.
# NOTE(review): the original comment spoke of 4068 available clips, but
# permutation(4067) can never yield 4067 -- confirm the real clip count.
_shuffled_numbers = randgen.permutation(4067)
sample_numbers = _shuffled_numbers[:SAMPLE_SIZE]

# Load the pretrained speaker encoder; its files are stored under
# pretrained_models/<MODEL>.
classifier = EncoderClassifier.from_hparams(
    source=MODEL,
    savedir=f"pretrained_models/{MODEL}",
)
# Create embeddings
def create_embedding(number, target_sample_rate=16000):
    """Return the unit-length speaker embedding of one Common Voice clip.

    Args:
        number: Clip number. It is zero-padded to six digits to build the
            file name ``sample-NNNNNN.mp3`` inside the ``cv-valid-dev``
            folder of the downloaded dataset.
        target_sample_rate: Sample rate in Hz the waveform is converted to
            before it is encoded. Defaults to 16000, the rate the
            SpeechBrain ECAPA VoxCeleb model card gives for its training
            data.

    Returns:
        NumPy array holding the first embedding produced by
        ``classifier.encode_batch`` for the clip, divided by its Euclidean
        norm.
    """
    file_path = kagglePath + '/cv-valid-dev/cv-valid-dev/sample-{:0>6}.mp3'.format(number)
    signal, sample_rate = torchaudio.load(file_path)
    # encode_batch receives the raw tensor, so nothing downstream knows the
    # file's sample rate; convert here when it differs from the target.
    if sample_rate != target_sample_rate:
        signal = torchaudio.functional.resample(signal, sample_rate, target_sample_rate)
    # NOTE(review): [0][0] keeps only the first row of the loaded tensor's
    # result -- presumably the first audio channel; confirm clips are mono.
    out = classifier.encode_batch(signal).numpy()[0][0]
    return out / norm(out)
# Embed every selected clip; each entry of `samples` is one clip's embedding.
samples = np.array(list(map(create_embedding, sample_numbers)))

# create_embedding returns vectors divided by their norm, so the pairwise
# inner products are the cosine similarities between clips.
scores = np.inner(samples, samples)

# One flat copy of the whole matrix for the statistics below.
# NOTE(review): this keeps the diagonal (each clip against itself) and both
# (i, j) and (j, i) of every pair -- confirm that is intended for the stats.
scores_flat = scores.flatten()
# Report the largest and smallest entry of the flattened similarity matrix.
if PRINT_BOUNDS:
    # Use the array's own reductions: the builtin max()/min() walk the array
    # element by element in Python and give order-dependent results if a NaN
    # is present, whereas ndarray.max()/min() are vectorised and propagate NaN.
    print(f'Max: {scores_flat.max()}')
    print(f'Min: {scores_flat.min()}')
# Heatmap of the full pairwise similarity matrix.
if SHOW_MATRIX:
    matrix_fig, matrix_ax = plt.subplots()
    heatmap = matrix_ax.imshow(scores, cmap='viridis', interpolation='none')
    matrix_fig.colorbar(heatmap, ax=matrix_ax, label='Cosine similarity')
    matrix_ax.set_title(f'Similarity scores for {MODEL}')
    plt.show()
# Distribution of all similarity scores, bucketed into 20 bins.
if SHOW_HIST:
    hist_fig, hist_ax = plt.subplots()
    hist_ax.hist(scores_flat, bins=20)
    hist_ax.set_title(f'Similarity score distribution for {MODEL}')
    hist_ax.set_xlabel('Cosine similarity')
    hist_ax.set_ylabel('Number of occurences')
    plt.show()
# Empirical CDF of the similarity scores: sorted scores on x, evenly spaced
# cumulative fractions in [0, 1) on y.
# These arrays are built unconditionally because the quantile and model-fit
# sections below read them too; defining them only under SHOW_ECDF raised a
# NameError as soon as SHOW_ECDF was switched off.
ecdf_x = np.sort(scores_flat)
ecdf_y = np.linspace(0, 1, len(scores_flat), endpoint=False)

# Show the E.C.D.F
if SHOW_ECDF:
    plt.plot(ecdf_x, ecdf_y)
    plt.title(f'E.c.d.f of similarity scores for {MODEL}')
    plt.xlabel('Cosine similarity')
    plt.ylabel('F(n)')
    plt.show()

# Find quantiles
if PRINT_QUANTILES:
    quantiles = [0.6, 0.7, 0.75, 0.8, 0.9]  # ascending order
    for quant in quantiles:
        print(f'q({quant}) = {np.quantile(ecdf_x, quant)}')

# Fit a model to the E.C.D.F
if FIT_MODEL_ECDF and ECDF_MODEL:
    parameters, covariance = curve_fit(ECDF_MODEL, ecdf_x, ecdf_y)
    print(f'Parameters: {parameters}')
    print(f'Covariance: {covariance}')

    # Overlay the fitted ECDF_MODEL curve on the empirical CDF.
    plt.plot(ecdf_x, ecdf_y)
    # ecdf_x is sorted, so its first and last entries are the min and max.
    x_samp = np.linspace(ecdf_x[0], ecdf_x[-1], 1000)
    y_samp = ECDF_MODEL(x_samp, *parameters)
    plt.plot(x_samp, y_samp)
    plt.title(f'E.c.d.f of similarity scores for {MODEL}')
    plt.xlabel('Cosine similarity')
    plt.ylabel('F(n)')
    plt.show()