"""K-means clustering.

README, Author - Anurag Kumar(mailto:anuragkumarak95@gmail.com)

Requirements:
  - sklearn
  - numpy
  - matplotlib

Python:
  - 3.5

Inputs:
  - X , a 2D numpy array of features.
  - k , number of clusters to create.
  - initial_centroids , initial centroid values generated by the utility
    function (mentioned in usage).
  - maxiter , maximum number of iterations to process.
  - heterogeneity , empty list that will be filled with heterogeneity values
    if passed to the kmeans function.

Usage:
  1. Define the 'k' value, the 'X' features array and an empty
     'heterogeneity' list.

  2. Create initial_centroids:
        initial_centroids = get_initial_centroids(
            X,
            k,
            seed=0  # seed value for initial centroid generation,
                    # None for randomness (default=None)
        )

  3. Find centroids and clusters using the kmeans function:
        centroids, cluster_assignment = kmeans(
            X,
            k,
            initial_centroids,
            maxiter=400,
            record_heterogeneity=heterogeneity,
            verbose=True  # whether to print logs in console or not
                          # (default=False)
        )

  4. Plot the loss function: the heterogeneity values for every iteration
     saved in the heterogeneity list.
        plot_heterogeneity(
            heterogeneity,
            k
        )

  5. Have fun..
"""
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import pairwise_distances

TAG = 'K-MEANS-CLUST/ '


def get_initial_centroids(data, k, seed=None):
    """Randomly choose k data points as initial centroids.

    Parameters
    ----------
    data : 2D numpy array, one data point per row.
    k : number of centroids to pick.
    seed : optional seed for numpy's global random generator; pass a value
        to obtain reproducible results, None (default) for randomness.

    Returns
    -------
    The k selected rows of ``data``.
    """
    if seed is not None:  # useful for obtaining consistent results
        np.random.seed(seed)
    n = data.shape[0]  # number of data points

    # Pick k distinct row indices from [0, n). Sampling with replacement can
    # select the same point twice, which yields duplicate centroids and an
    # empty cluster; duplicates are only allowed when k > n makes them
    # unavoidable.
    rand_indices = np.random.choice(n, k, replace=k > n)

    return data[rand_indices, :]


def centroid_pairwise_dist(X, centroids):
    """Return the Euclidean distances between the rows of X and the centroids.

    Entry [i, j] of the result is the distance from X[i] to centroids[j].
    """
    return pairwise_distances(X, centroids, metric='euclidean')


def assign_clusters(data, centroids):
    """Return, for every data point, the index of its nearest centroid."""
    # Distances between each data point and the set of centroids.
    distances_from_centroids = centroid_pairwise_dist(data, centroids)

    # The closest centroid (smallest distance along each row) wins.
    return np.argmin(distances_from_centroids, axis=1)


def revise_centroids(data, k, cluster_assignment, previous_centroids=None):
    """Recompute each centroid as the mean of the points assigned to it.

    Parameters
    ----------
    data : 2D numpy array, one data point per row.
    k : number of clusters.
    cluster_assignment : cluster index of every data point.
    previous_centroids : optional centroids from the previous iteration,
        used only to fill in clusters that currently have no members.

    Returns
    -------
    Numpy array holding the k revised centroids.

    Notes
    -----
    The mean of an empty selection is NaN, so a cluster without members
    keeps its previous centroid when one is supplied and otherwise falls
    back to the mean of all data points.
    """
    new_centroids = []
    for i in range(k):
        # Select all data points that belong to cluster i.
        member_data_points = data[cluster_assignment == i]
        if member_data_points.shape[0] > 0:
            centroid = member_data_points.mean(axis=0)
        elif previous_centroids is not None:
            centroid = previous_centroids[i]
        else:
            centroid = data.mean(axis=0)
        new_centroids.append(centroid)

    return np.array(new_centroids)


def compute_heterogeneity(data, k, centroids, cluster_assignment):
    """Return the sum of squared distances of all points to their centroid.

    Clusters without members contribute nothing to the sum.
    """
    heterogeneity = 0.0
    for i in range(k):
        # Select all data points that belong to cluster i.
        member_data_points = data[cluster_assignment == i, :]

        if member_data_points.shape[0] > 0:  # skip empty clusters
            # Distances from the i-th centroid to its member points.
            distances = centroid_pairwise_dist(member_data_points, [centroids[i]])
            heterogeneity += np.sum(distances ** 2)

    return heterogeneity


def plot_heterogeneity(heterogeneity, k):
    """Plot the recorded heterogeneity values against the iteration number.

    Parameters
    ----------
    heterogeneity : sequence of heterogeneity values, one per iteration.
    k : number of clusters, shown in the plot title.
    """
    # Set the font size before the figure and its labels are created so that
    # it applies to this plot.
    plt.rcParams.update({'font.size': 16})
    plt.figure(figsize=(7, 4))
    plt.plot(heterogeneity, linewidth=4)
    plt.xlabel('# Iterations')
    plt.ylabel('Heterogeneity')
    plt.title('Heterogeneity of clustering over time, K={0:d}'.format(k))
    plt.show()


def kmeans(data, k, initial_centroids, maxiter=500, record_heterogeneity=None,
           verbose=False):
    """Run k-means on the given data and initial set of centroids.

    Parameters
    ----------
    data : 2D numpy array, one data point per row.
    k : number of clusters.
    initial_centroids : starting centroids, e.g. from get_initial_centroids.
    maxiter : maximum number of iterations to run (default=500).
    record_heterogeneity : optional list; when given, the heterogeneity of
        every non-final iteration is appended to it. If None, the history is
        not stored.
    verbose : if True, print the iteration number and how many data points
        changed their cluster label in each iteration.

    Returns
    -------
    (centroids, cluster_assignment) : the final centroids and the cluster
        index of every data point.
    """
    # Work on a copy so the caller's array is never shared with the result.
    centroids = np.array(initial_centroids)
    prev_cluster_assignment = None
    cluster_assignment = None

    for itr in range(maxiter):
        # 1. Make cluster assignments using nearest centroids.
        cluster_assignment = assign_clusters(data, centroids)

        # 2. Compute a new centroid for each of the k clusters, averaging all
        #    data points assigned to that cluster. The current centroids are
        #    passed along so that an empty cluster keeps its position.
        centroids = revise_centroids(data, k, cluster_assignment, centroids)

        if prev_cluster_assignment is None:
            # Nothing to compare against on the first iteration.
            if verbose:
                print(itr)
        else:
            num_changed = np.sum(prev_cluster_assignment != cluster_assignment)
            if verbose:
                print('{0} {1:5d} elements changed their cluster assignment.'
                      .format(itr, num_changed))
            # Converged: none of the assignments changed.
            if num_changed == 0:
                break

        # Record heterogeneity convergence metric.
        if record_heterogeneity is not None:
            score = compute_heterogeneity(data, k, centroids, cluster_assignment)
            record_heterogeneity.append(score)

        prev_cluster_assignment = cluster_assignment

    if cluster_assignment is None:
        # maxiter <= 0: no iteration ran, report the assignment that belongs
        # to the initial centroids.
        cluster_assignment = assign_clusters(data, centroids)

    return centroids, cluster_assignment


def _demo():
    """Cluster the iris data set into 3 groups and plot the heterogeneity."""
    import sklearn.datasets as ds

    dataset = ds.load_iris()
    k = 3
    heterogeneity = []
    initial_centroids = get_initial_centroids(dataset['data'], k, seed=0)
    centroids, cluster_assignment = kmeans(
        dataset['data'],
        k,
        initial_centroids,
        maxiter=400,
        record_heterogeneity=heterogeneity,
        verbose=True,
    )
    plot_heterogeneity(heterogeneity, k)


# Mock test: run this file as a script to execute it.
if __name__ == '__main__':
    _demo()