forked from lazyprogrammer/machine_learning_examples
kmeans.py
# https://deeplearningcourses.com/c/cluster-analysis-unsupervised-machine-learning-python
# https://www.udemy.com/cluster-analysis-unsupervised-machine-learning-python
import numpy as np
import matplotlib.pyplot as plt
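
# Soft k-means: instead of hard cluster assignments, each point n carries a
# responsibility R[n,k] for cluster k:
#   R[n,k] = exp(-beta * d(M[k], X[n])) / sum_j exp(-beta * d(M[j], X[n]))
# and each mean is the responsibility-weighted average of the points:
#   M[k] = R[:,k].dot(X) / R[:,k].sum()
# beta controls the softness: larger beta pushes R toward hard 0/1 assignments.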


def d(u, v):
    # squared Euclidean distance between vectors u and v
    diff = u - v
    return diff.dot(diff)


def cost(X, R, M):
    # responsibility-weighted total squared distance of points to cluster means
    cost = 0
    for k in range(len(M)):
        # method 1: explicit loop over all points
        # for n in range(len(X)):
        #     cost += R[n,k]*d(M[k], X[n])

        # method 2: vectorized over the points
        diff = X - M[k]
        sq_distances = (diff * diff).sum(axis=1)
        cost += (R[:,k] * sq_distances).sum()
    return cost


def plot_k_means(X, K, max_iter=20, beta=1.0, show_plots=True):
    N, D = X.shape
    M = np.zeros((K, D))
    exponents = np.empty((N, K))

    # initialize each mean to a randomly chosen data point
    for k in range(K):
        M[k] = X[np.random.choice(N)]

    costs = np.zeros(max_iter)
    for i in range(max_iter):
        # step 1: determine assignments / responsibilities
        for k in range(K):
            for n in range(N):
                exponents[n,k] = np.exp(-beta * d(M[k], X[n]))
        # normalize each row so a point's responsibilities sum to 1
        R = exponents / exponents.sum(axis=1, keepdims=True)
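        # Note: the double loop above is slow for large N*K. A sketch of one
        # possible vectorized alternative (same (N, K) matrix, no Python loops):
        #   sq_dists = ((X[:, None, :] - M[None, :, :]) ** 2).sum(axis=2)
        #   exponents = np.exp(-beta * sq_dists)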

        # step 2: recalculate each mean as the responsibility-weighted average
        for k in range(K):
            M[k] = R[:,k].dot(X) / R[:,k].sum()

        costs[i] = cost(X, R, M)
        # stop early once the cost has effectively converged
        if i > 0:
            if np.abs(costs[i] - costs[i-1]) < 1e-4:
                break

    if show_plots:
        plt.plot(costs)
        plt.title("Costs")
        plt.show()

        # color each point by mixing K random colors according to its
        # responsibilities, so softly-assigned points appear as blends
        random_colors = np.random.random((K, 3))
        colors = R.dot(random_colors)
        plt.scatter(X[:,0], X[:,1], c=colors)
        plt.show()

    return M, R


def get_simple_data():
    # assume 3 means
    D = 2 # so we can visualize it more easily
    s = 4 # separation so we can control how far apart the means are
    mu1 = np.array([0, 0])
    mu2 = np.array([s, s])
    mu3 = np.array([0, s])

    N = 900 # number of samples
    X = np.zeros((N, D))
    X[:300, :] = np.random.randn(300, D) + mu1
    X[300:600, :] = np.random.randn(300, D) + mu2
    X[600:, :] = np.random.randn(300, D) + mu3
    return X


def main():
    X = get_simple_data()

    # what does it look like without clustering?
    plt.scatter(X[:,0], X[:,1])
    plt.show()

    K = 3 # luckily, we already know this
    plot_k_means(X, K)

    K = 5 # what happens if we choose a "bad" K?
    plot_k_means(X, K, max_iter=30)

    K = 5 # what happens if we change beta?
    plot_k_means(X, K, max_iter=30, beta=0.3)


if __name__ == '__main__':
    main()