-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKMeans.py
230 lines (199 loc) · 10.4 KB
/
KMeans.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
import numpy as np
import pandas as pd
import random
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import math
import threading
import time
import os
class KMeans:
def __init__(self,inputFile,k = -1,iterations = 10):
''' Initializing the Classifier '''
# Reading the input data from the .csv file and creating a Pandas DataFrame
self.__inputDataFrame = pd.read_csv('./inputs/' + inputFile)
# No of Training Examples
self.__m = self.__inputDataFrame.shape[0]
# No of features
self.__n = self.__inputDataFrame.shape[1]
# self.__predefinedK stores if the user wants to number of clusters to predefined or not
self.__predefinedK = True if(k != -1) else False
# self.k saves the number of clusters formed
self.__k = 1 if(k == -1) else k
# Taking Max and Min Value of all features to make random initializations
self.__maxFeatures = self.__inputDataFrame.max()
self.__minFeatures = self.__inputDataFrame.min()
# self.__iter saves the number of random initiations for a value of k user wants
self.__iter = iterations
# self.__class stores the classes of the Training Examples
self.__output = {}
# self.__outputPath saves the path where graphs and file containing the centroids are saved
self.__outputPath = './outputs/' + inputFile.split(".")[0] + '/'
if not os.path.exists(self.__outputPath):
os.makedirs(self.__outputPath)
def __initializeCentroids(self,k):
''' Initialize the Centroids and return Panda DataFrame '''
centroids = []
probab = [0 for i in range(self.__m)]
# Initializing the first centroid as a random point of the data
firstCentroid = self.__inputDataFrame.iloc[random.randint(0, self.__m - 1)]
centroids.append([firstCentroid[j] for j in range(self.__n)])
# Initializing more centroids based on Maximum distance
for j in range(k-1):
allDistanceList = self.__calculateCentroidsDistance(pd.DataFrame(centroids, columns = self.__inputDataFrame.columns.values),j+1)
for i in range(self.__m):
probab[i] = min([allDistanceList[m][i] for m in range(j+1)])
nextCentroid = self.__inputDataFrame.iloc[probab.index(max(probab))]
centroids.append([nextCentroid[j] for j in range(self.__n)])
return pd.DataFrame(centroids, columns = self.__inputDataFrame.columns.values)
def __calculateCentroidsDistanceThreads(self,centroids,j,distanceList):
''' Threads To make Calculations Faster '''
# Looping over all Training Examples
for i in range(self.__m):
# Distance from the centroid is initialized to be Zero
distance = 0
# Looping over all Feature Values and Calculating Euclidean Distance
for n in range(self.__n):
distance += pow(self.__inputDataFrame.iloc[i][n] - centroids.iloc[j][n], 2)
distanceList.append(distance)
def __calculateCentroidsDistance(self,centroids,k):
''' Function to calculate Distance of Points from Centroids by Launching Threads '''
# Looping over all Centroids
allDistanceList = []
threads = []
for j in range(k):
distanceList = []
t = threading.Thread(target=self.__calculateCentroidsDistanceThreads, args=(centroids, j, distanceList,))
t.start()
threads.append(t)
allDistanceList.append(distanceList)
# Waiting Until all Threads Join
for j in range(k):
threads[j].join()
return allDistanceList
def __assignClusters(self,centroids,k):
''' Function which assigns the cluster to every Training Example by allocating the class of the nearest centroid '''
classified = np.array([-1 for i in range(self.__m)])
allDistanceList = self.__calculateCentroidsDistance(centroids,k)
# Assigning Cluster to the Training Examples
for i in range(self.__m):
values = [allDistanceList[j][i] for j in range(k)]
classified[i] = values.index(min(values))
return classified
def __recomputeCentroidsThread(self,centroids,classified,j):
''' Threads To make Calculations Faster '''
# Calculating all the Training Examples which are classified under the class of this centroid
indeces = np.where(classified == j)[0]
# Looping over all the features of the centroids and recomputing its coordinates with mean of all Training Examples
for colName in self.__inputDataFrame.columns.values:
centroids.iloc[j][colName] = self.__inputDataFrame.iloc[indeces][colName].mean()
def __recomputeCentroids(self,centroids,k,classified):
''' Reassigning the coordinates to the Centroids of the clusters '''
threads = []
# Looping over all the centroids
for j in range(k):
t = threading.Thread(target=self.__recomputeCentroidsThread, args=(centroids, classified, j,))
t.start()
threads.append(t)
# Waiting Until all Threads Join
for j in range(k):
threads[j].join()
def __showGraph2D(self,centroids,classified,k):
''' Function to show K Mean for 2 Feature Data '''
x,y = self.__inputDataFrame.columns.values
plt.scatter(self.__inputDataFrame[x],self.__inputDataFrame[y],c = classified,marker = ".",s = 50)
plt.scatter(centroids[x],centroids[y],c = [i for i in range(k)],marker = "*",s = 70)
plt.xlabel(x)
plt.ylabel(y)
plt.title('K Means Graph')
plt.savefig(self.__outputPath + '2D Clustered Data.png', bbox_inches='tight')
plt.close()
def __showGraph3D(self,centroids,classified,k):
''' Function to show K Mean for 3 Feature Data '''
x,y,z = self.__inputDataFrame.columns.values
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(self.__inputDataFrame[x],self.__inputDataFrame[y],self.__inputDataFrame[z],c = classified,marker = ".",s = 50)
ax.scatter(centroids[x],centroids[y],centroids[z],c = [i for i in range(k)],marker = "*",s = 70)
ax.set_xlabel(x)
ax.set_ylabel(y)
ax.set_zlabel(z)
plt.savefig(self.__outputPath + '3D Clustered Data.png',bbox_inches='tight')
plt.close()
def __showGraphObjectiveFuntion2D(self):
''' Function to show K v/s Objective Function Value '''
x = [key for key in self.__output]
y = [math.log(value[1],10) for key,value in self.__output.items()]
# Calculating the slopes by taking difference in y as change in x is a unit
slopes = [j-i for i, j in zip(y[:-1], y[1:])]
# deltaSlopes save the values of change in slope to find the Elbow Point
deltaSlopes = [j-i for i, j in zip(slopes[:-1], slopes[1:])]
self.__elbowPoint = deltaSlopes.index(max(deltaSlopes)) + 2
plt.scatter(x, y)
plt.plot(x, y)
plt.xlabel('K')
plt.ylabel('log(Objective Function Value)')
plt.savefig(self.__outputPath + 'Objective Function vs K.png', bbox_inches='tight')
plt.close()
def __computeObjectiveFunction(self,centroids,classified):
''' Function computes the value of the objective function '''
rssFunction = 0
for i in range(self.__m):
distance = 0
for j in range(self.__n):
distance += pow(self.__inputDataFrame.iloc[i][j]-centroids.iloc[classified[i]][j],2)
rssFunction += pow(distance,0.5)
return rssFunction
def __runIteration(self,k):
''' Function which runs the K-Means Algo '''
centroids = self.__initializeCentroids(k)
# lastClassified tells if there are any changes in classification or not
lastClassified = np.array([[-1 for i in range(self.__m)]])
classified = np.array([])
while True:
classified = self.__assignClusters(centroids,k)
# If the current classification is equal to the previous classification then clusters have been formed. Stop Calculations
if (lastClassified == classified).all():
break
self.__recomputeCentroids(centroids,k,classified)
lastClassified = classified
# Calculating the Objective Function to check that
rssFunction = self.__computeObjectiveFunction(centroids,classified)
# Insert in output Dictionary if key doesnt exist or Update the Output Dictionary if Objective Function decreases
if (k not in self.__output.keys() or self.__output[k][1] > rssFunction):
self.__output[k] = (centroids, rssFunction, classified)
def __iterationHandler(self):
''' Function to Handle all iterations over a single value of K '''
threads = []
# Starting all iterations with Random Iterations Simultaneously
for i in range(self.__iter):
t = threading.Thread(target = self.__runIteration,args = (self.__k,))
t.start()
threads.append(t)
# Waiting For the Threads to end
for i in range(self.__iter):
threads[i].join()
def classify(self):
''' Function which acts as an interface between the user and the class '''
if self.__predefinedK:
self.__iterationHandler()
self.__elbowPoint = self.__k
else:
# If K is not Predefined then make the iterations until K is less than 2*root(Training Examples / 2). Just an Heuristic
while self.__k <= 15:
print(self.__k)
self.__iterationHandler()
self.__k += 1
self.__showGraphObjectiveFuntion2D()
# Write Centroid of the Elbow Point to the CSV
self.__output[self.__elbowPoint][0].to_csv(self.__outputPath + 'centroid.csv',index=False)
# Show the graph changing if no of features are 2 or 3
if self.__n == 2:
self.__showGraph2D(self.__output[self.__elbowPoint][0],self.__output[self.__elbowPoint][2],self.__elbowPoint)
if self.__n == 3:
self.__showGraph3D(self.__output[self.__elbowPoint][0],self.__output[self.__elbowPoint][2],self.__elbowPoint)
if __name__ == '__main__':
start_time = time.time()
kMeansClassifier = KMeans('House Images.csv',iterations = 3)
kMeansClassifier.classify()
print(str(time.time() - start_time) + " sec")