-
Notifications
You must be signed in to change notification settings - Fork 4.6k
/
Copy pathgaussian_mixture_model.py
121 lines (105 loc) · 4.61 KB
/
gaussian_mixture_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from __future__ import division, print_function
import math
from sklearn import datasets
import numpy as np
from mlfromscratch.utils import normalize, euclidean_distance, calculate_covariance_matrix
from mlfromscratch.utils import Plot
class GaussianMixtureModel():
"""A probabilistic clustering method for determining groupings among data samples.
Parameters:
-----------
k: int
The number of clusters the algorithm will form.
max_iterations: int
The number of iterations the algorithm will run for if it does
not converge before that.
tolerance: float
If the difference of the results from one iteration to the next is
smaller than this value we will say that the algorithm has converged.
"""
def __init__(self, k=2, max_iterations=2000, tolerance=1e-8):
self.k = k
self.parameters = []
self.max_iterations = max_iterations
self.tolerance = tolerance
self.responsibilities = []
self.sample_assignments = None
self.responsibility = None
def _init_random_gaussians(self, X):
""" Initialize gaussian randomly """
n_samples = np.shape(X)[0]
self.priors = (1 / self.k) * np.ones(self.k)
for i in range(self.k):
params = {}
params["mean"] = X[np.random.choice(range(n_samples))]
params["cov"] = calculate_covariance_matrix(X)
self.parameters.append(params)
def multivariate_gaussian(self, X, params):
""" Likelihood """
n_features = np.shape(X)[1]
mean = params["mean"]
covar = params["cov"]
determinant = np.linalg.det(covar)
likelihoods = np.zeros(np.shape(X)[0])
for i, sample in enumerate(X):
d = n_features # dimension
coeff = (1.0 / (math.pow((2.0 * math.pi), d / 2)
* math.sqrt(determinant)))
exponent = math.exp(-0.5 * (sample - mean).T.dot(np.linalg.pinv(covar)).dot((sample - mean)))
likelihoods[i] = coeff * exponent
return likelihoods
def _get_likelihoods(self, X):
""" Calculate the likelihood over all samples """
n_samples = np.shape(X)[0]
likelihoods = np.zeros((n_samples, self.k))
for i in range(self.k):
likelihoods[
:, i] = self.multivariate_gaussian(
X, self.parameters[i])
return likelihoods
def _expectation(self, X):
""" Calculate the responsibility """
# Calculate probabilities of X belonging to the different clusters
weighted_likelihoods = self._get_likelihoods(X) * self.priors
sum_likelihoods = np.expand_dims(
np.sum(weighted_likelihoods, axis=1), axis=1)
# Determine responsibility as P(X|y)*P(y)/P(X)
self.responsibility = weighted_likelihoods / sum_likelihoods
# Assign samples to cluster that has largest probability
self.sample_assignments = self.responsibility.argmax(axis=1)
# Save value for convergence check
self.responsibilities.append(np.max(self.responsibility, axis=1))
def _maximization(self, X):
""" Update the parameters and priors """
# Iterate through clusters and recalculate mean and covariance
for i in range(self.k):
resp = np.expand_dims(self.responsibility[:, i], axis=1)
mean = (resp * X).sum(axis=0) / resp.sum()
covariance = (X - mean).T.dot((X - mean) * resp) / resp.sum()
self.parameters[i]["mean"], self.parameters[
i]["cov"] = mean, covariance
# Update weights
n_samples = np.shape(X)[0]
self.priors = self.responsibility.sum(axis=0) / n_samples
def _converged(self, X):
""" Covergence if || likehood - last_likelihood || < tolerance """
if len(self.responsibilities) < 2:
return False
diff = np.linalg.norm(
self.responsibilities[-1] - self.responsibilities[-2])
# print ("Likelihood update: %s (tol: %s)" % (diff, self.tolerance))
return diff <= self.tolerance
def predict(self, X):
""" Run GMM and return the cluster indices """
# Initialize the gaussians randomly
self._init_random_gaussians(X)
# Run EM until convergence or for max iterations
for _ in range(self.max_iterations):
self._expectation(X) # E-step
self._maximization(X) # M-step
# Check convergence
if self._converged(X):
break
# Make new assignments and return them
self._expectation(X)
return self.sample_assignments