1
1
# https://deeplearningcourses.com/c/cluster-analysis-unsupervised-machine-learning-python
2
2
# https://www.udemy.com/cluster-analysis-unsupervised-machine-learning-python
3
+ from __future__ import print_function , division
4
+ from future .utils import iteritems
5
+ from builtins import range , input
6
+ # Note: you may need to update your version of future
7
+ # sudo pip install -U future
8
+
9
+
3
10
import networkx as nx
4
11
import nltk
5
12
import numpy as np
@@ -38,9 +45,12 @@ def my_tokenizer(s):
38
45
all_tokens = []
39
46
all_titles = []
40
47
index_word_map = []
48
+ print ("num titles:" , len (titles ))
49
+ print ("first title:" , titles [0 ])
41
50
for title in titles :
42
51
try :
43
52
title = title .encode ('ascii' , 'ignore' ) # this will throw exception if bad characters
53
+ title = title .decode ('utf-8' )
44
54
all_titles .append (title )
45
55
tokens = my_tokenizer (title )
46
56
all_tokens .append (tokens )
@@ -49,8 +59,8 @@ def my_tokenizer(s):
49
59
word_index_map [token ] = current_index
50
60
current_index += 1
51
61
index_word_map .append (token )
52
- except :
53
- pass
62
+ except Exception as e :
63
+ print ( e )
54
64
55
65
56
66
@@ -76,9 +86,9 @@ def d(u, v):
76
86
77
87
def cost (X , R , M ):
78
88
cost = 0
79
- for k in xrange (len (M )):
89
+ for k in range (len (M )):
80
90
# method 1
81
- # for n in xrange (len(X)):
91
+ # for n in range (len(X)):
82
92
# cost += R[n,k]*d(M[k], X[n])
83
93
84
94
# method 2
@@ -94,22 +104,22 @@ def plot_k_means(X, K, index_word_map, max_iter=20, beta=1.0, show_plots=True):
94
104
exponents = np .empty ((N , K ))
95
105
96
106
# initialize M to random
97
- for k in xrange (K ):
107
+ for k in range (K ):
98
108
M [k ] = X [np .random .choice (N )]
99
109
100
110
costs = np .zeros (max_iter )
101
- for i in xrange (max_iter ):
111
+ for i in range (max_iter ):
102
112
# step 1: determine assignments / responsibilities
103
113
# is this inefficient?
104
- for k in xrange (K ):
105
- for n in xrange (N ):
106
- # R[n,k] = np.exp(-beta*d(M[k], X[n])) / np.sum( np.exp(-beta*d(M[j], X[n])) for j in xrange (K) )
114
+ for k in range (K ):
115
+ for n in range (N ):
116
+ # R[n,k] = np.exp(-beta*d(M[k], X[n])) / np.sum( np.exp(-beta*d(M[j], X[n])) for j in range (K) )
107
117
exponents [n ,k ] = np .exp (- beta * d (M [k ], X [n ]))
108
118
109
119
R = exponents / exponents .sum (axis = 1 , keepdims = True )
110
120
111
121
# step 2: recalculate means
112
- for k in xrange (K ):
122
+ for k in range (K ):
113
123
M [k ] = R [:,k ].dot (X ) / R [:,k ].sum ()
114
124
115
125
costs [i ] = cost (X , R , M )
@@ -135,16 +145,16 @@ def plot_k_means(X, K, index_word_map, max_iter=20, beta=1.0, show_plots=True):
135
145
hard_responsibilities = np .argmax (R , axis = 1 ) # is an N-size array of cluster identities
136
146
# let's "reverse" the order so it's cluster identity -> word index
137
147
cluster2word = {}
138
- for i in xrange (len (hard_responsibilities )):
148
+ for i in range (len (hard_responsibilities )):
139
149
word = index_word_map [i ]
140
150
cluster = hard_responsibilities [i ]
141
151
if cluster not in cluster2word :
142
152
cluster2word [cluster ] = []
143
153
cluster2word [cluster ].append (word )
144
154
145
155
# print out the words grouped by cluster
146
- for cluster , wordlist in cluster2word .iteritems ():
147
- print "cluster" , cluster , "->" , wordlist
156
+ for cluster , wordlist in cluster2word .items ():
157
+ print ( "cluster" , cluster , "->" , wordlist )
148
158
149
159
return M , R
150
160
@@ -155,7 +165,7 @@ def plot_k_means(X, K, index_word_map, max_iter=20, beta=1.0, show_plots=True):
155
165
# G = nx.DiGraph()
156
166
# data_nodes = []
157
167
# init_pos = {}
158
- # for i in xrange (N):
168
+ # for i in range (N):
159
169
# x, y = X[i]
160
170
# label = index_word_map[i]
161
171
# data_str = 'data_{0}'.format(label)
@@ -197,15 +207,15 @@ def plot_k_means(X, K, index_word_map, max_iter=20, beta=1.0, show_plots=True):
197
207
def annotate1 (X , index_word_map , eps = 0.1 ):
198
208
N , D = X .shape
199
209
placed = np .empty ((N , D ))
200
- for i in xrange (N ):
210
+ for i in range (N ):
201
211
x , y = X [i ]
202
212
203
213
# if x, y is too close to something already plotted, move it
204
214
close = []
205
215
206
216
x , y = X [i ]
207
- for retry in xrange (3 ):
208
- for j in xrange (i ):
217
+ for retry in range (3 ):
218
+ for j in range (i ):
209
219
diff = np .array ([x , y ]) - placed [j ]
210
220
211
221
# if something is close, append it to the close list
@@ -233,11 +243,11 @@ def annotate1(X, index_word_map, eps=0.1):
233
243
}
234
244
)
235
245
236
- print "vocab size:" , current_index
246
+ print ( "vocab size:" , current_index )
237
247
238
248
transformer = TfidfTransformer ()
239
249
X = transformer .fit_transform (X ).toarray ()
240
250
241
251
reducer = TSNE ()
242
252
Z = reducer .fit_transform (X )
243
- plot_k_means (Z [:,:2 ], current_index / 10 , index_word_map , show_plots = True )
253
+ plot_k_means (Z [:,:2 ], current_index // 10 , index_word_map , show_plots = True )
0 commit comments