python 3

lazyprogrammer · lazyprogrammer · commit 4c6ffb1779f4 · 2017-12-25T19:45:11.000-05:00
diff --git a/unsupervised_class/books.py b/unsupervised_class/books.py
@@ -1,5 +1,12 @@
 # https://deeplearningcourses.com/c/cluster-analysis-unsupervised-machine-learning-python
 # https://www.udemy.com/cluster-analysis-unsupervised-machine-learning-python
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range, input
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import networkx as nx
 import nltk
 import numpy as np
@@ -38,9 +45,12 @@ def my_tokenizer(s):
 all_tokens = []
 all_titles = []
 index_word_map = []
+print("num titles:", len(titles))
+print("first title:", titles[0])
 for title in titles:
     try:
         title = title.encode('ascii', 'ignore') # this will throw exception if bad characters
+        title = title.decode('utf-8')
         all_titles.append(title)
         tokens = my_tokenizer(title)
         all_tokens.append(tokens)
@@ -49,8 +59,8 @@ def my_tokenizer(s):
                 word_index_map[token] = current_index
                 current_index += 1
                 index_word_map.append(token)
-    except:
-        pass
+    except Exception as e:
+        print(e)
 
 
 
@@ -76,9 +86,9 @@ def d(u, v):
 
 def cost(X, R, M):
     cost = 0
-    for k in xrange(len(M)):
+    for k in range(len(M)):
         # method 1
-        # for n in xrange(len(X)):
+        # for n in range(len(X)):
         #     cost += R[n,k]*d(M[k], X[n])
 
         # method 2
@@ -94,22 +104,22 @@ def plot_k_means(X, K, index_word_map, max_iter=20, beta=1.0, show_plots=True):
     exponents = np.empty((N, K))
 
     # initialize M to random
-    for k in xrange(K):
+    for k in range(K):
         M[k] = X[np.random.choice(N)]
 
     costs = np.zeros(max_iter)
-    for i in xrange(max_iter):
+    for i in range(max_iter):
         # step 1: determine assignments / resposibilities
         # is this inefficient?
-        for k in xrange(K):
-            for n in xrange(N):
-                # R[n,k] = np.exp(-beta*d(M[k], X[n])) / np.sum( np.exp(-beta*d(M[j], X[n])) for j in xrange(K) )
+        for k in range(K):
+            for n in range(N):
+                # R[n,k] = np.exp(-beta*d(M[k], X[n])) / np.sum( np.exp(-beta*d(M[j], X[n])) for j in range(K) )
                 exponents[n,k] = np.exp(-beta*d(M[k], X[n]))
 
         R = exponents / exponents.sum(axis=1, keepdims=True)
 
         # step 2: recalculate means
-        for k in xrange(K):
+        for k in range(K):
             M[k] = R[:,k].dot(X) / R[:,k].sum()
 
         costs[i] = cost(X, R, M)
@@ -135,16 +145,16 @@ def plot_k_means(X, K, index_word_map, max_iter=20, beta=1.0, show_plots=True):
     hard_responsibilities = np.argmax(R, axis=1) # is an N-size array of cluster identities
     # let's "reverse" the order so it's cluster identity -> word index
     cluster2word = {}
-    for i in xrange(len(hard_responsibilities)):
+    for i in range(len(hard_responsibilities)):
       word = index_word_map[i]
       cluster = hard_responsibilities[i]
       if cluster not in cluster2word:
         cluster2word[cluster] = []
       cluster2word[cluster].append(word)
 
     # print out the words grouped by cluster
-    for cluster, wordlist in cluster2word.iteritems():
-      print "cluster", cluster, "->", wordlist
+    for cluster, wordlist in cluster2word.items():
+      print("cluster", cluster, "->", wordlist)
 
     return M, R
 
@@ -155,7 +165,7 @@ def plot_k_means(X, K, index_word_map, max_iter=20, beta=1.0, show_plots=True):
 #   G = nx.DiGraph()
 #   data_nodes = []
 #   init_pos = {}
-#   for i in xrange(N):
+#   for i in range(N):
 #     x, y = X[i]
 #     label = index_word_map[i]
 #     data_str = 'data_{0}'.format(label)
@@ -197,15 +207,15 @@ def plot_k_means(X, K, index_word_map, max_iter=20, beta=1.0, show_plots=True):
 def annotate1(X, index_word_map, eps=0.1):
   N, D = X.shape
   placed = np.empty((N, D))
-  for i in xrange(N):
+  for i in range(N):
     x, y = X[i]
 
     # if x, y is too close to something already plotted, move it
     close = []
 
     x, y = X[i]
-    for retry in xrange(3):
-      for j in xrange(i):
+    for retry in range(3):
+      for j in range(i):
         diff = np.array([x, y]) - placed[j]
 
         # if something is close, append it to the close list
@@ -233,11 +243,11 @@ def annotate1(X, index_word_map, eps=0.1):
       }
     )
 
-print "vocab size:", current_index
+print("vocab size:", current_index)
 
 transformer = TfidfTransformer()
 X = transformer.fit_transform(X).toarray()
 
 reducer = TSNE()
 Z = reducer.fit_transform(X)
-plot_k_means(Z[:,:2], current_index/10, index_word_map, show_plots=True)
+plot_k_means(Z[:,:2], current_index//10, index_word_map, show_plots=True)
diff --git a/unsupervised_class/choose_k.py b/unsupervised_class/choose_k.py
@@ -1,5 +1,12 @@
 # https://deeplearningcourses.com/c/cluster-analysis-unsupervised-machine-learning-python
 # https://www.udemy.com/cluster-analysis-unsupervised-machine-learning-python
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range, input
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 from kmeans import plot_k_means, get_simple_data, cost
@@ -13,7 +20,7 @@ def main():
 
   costs = np.empty(10)
   costs[0] = None
-  for k in xrange(1, 10):
+  for k in range(1, 10):
     M, R = plot_k_means(X, k, show_plots=False)
     c = cost(X, R, M)
     costs[k] = c
diff --git a/unsupervised_class/gmm.py b/unsupervised_class/gmm.py
@@ -1,46 +1,53 @@
 # https://deeplearningcourses.com/c/cluster-analysis-unsupervised-machine-learning-python
 # https://www.udemy.com/cluster-analysis-unsupervised-machine-learning-python
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range, input
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 
 from scipy.stats import multivariate_normal
 
 
-def gmm(X, K, max_iter=20, smoothing=10e-3):
+def gmm(X, K, max_iter=20, smoothing=1e-2):
     N, D = X.shape
     M = np.zeros((K, D))
     R = np.zeros((N, K))
     C = np.zeros((K, D, D))
     pi = np.ones(K) / K # uniform
 
     # initialize M to random, initialize C to spherical with variance 1
-    for k in xrange(K):
+    for k in range(K):
         M[k] = X[np.random.choice(N)]
         C[k] = np.eye(D)
 
     costs = np.zeros(max_iter)
     weighted_pdfs = np.zeros((N, K)) # we'll use these to store the PDF value of sample n and Gaussian k
-    for i in xrange(max_iter):
+    for i in range(max_iter):
         # step 1: determine assignments / resposibilities
-        for k in xrange(K):
-            for n in xrange(N):
+        for k in range(K):
+            for n in range(N):
                 weighted_pdfs[n,k] = pi[k]*multivariate_normal.pdf(X[n], M[k], C[k])
 
-        for k in xrange(K):
-            for n in xrange(N):
+        for k in range(K):
+            for n in range(N):
                 R[n,k] = weighted_pdfs[n,k] / weighted_pdfs[n,:].sum()
 
         # a faster way to do step 1: "vectorization"
-        # for k in xrange(K):
+        # for k in range(K):
         #     weighted_pdfs[:,k] = pi[k]*multivariate_normal.pdf(X, M[k], C[k])
         # R = weighted_pdfs / weighted_pdfs.sum(axis=1, keepdims=True)
 
         # step 2: recalculate params
-        for k in xrange(K):
+        for k in range(K):
             Nk = R[:,k].sum()
             pi[k] = Nk / N
             M[k] = R[:,k].dot(X) / Nk
-            C[k] = np.sum(R[n,k]*np.outer(X[n] - M[k], X[n] - M[k]) for n in xrange(N)) / Nk + np.eye(D)*smoothing
+            C[k] = np.sum(R[n,k]*np.outer(X[n] - M[k], X[n] - M[k]) for n in range(N)) / Nk + np.eye(D)*smoothing
 
 
         costs[i] = np.log(weighted_pdfs.sum(axis=1)).sum()
@@ -57,9 +64,9 @@ def gmm(X, K, max_iter=20, smoothing=10e-3):
     plt.scatter(X[:,0], X[:,1], c=colors)
     plt.show()
 
-    print "pi:", pi
-    print "means:", M
-    print "covariances:", C
+    print("pi:", pi)
+    print("means:", M)
+    print("covariances:", C)
     return R
 
 
diff --git a/unsupervised_class/gmm_mnist.py b/unsupervised_class/gmm_mnist.py
@@ -5,6 +5,12 @@
 # each image is a D = 28x28 = 784 dimensional vector
 # there are N = 42000 samples
 # you can plot an image by reshaping to (28,28) and using plt.imshow()
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range, input
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
 
 import numpy as np
 import pandas as pd
@@ -18,15 +24,15 @@
 
 def main():
     X, Y = get_data(10000)
-    print "Number of data points:", len(Y)
+    print("Number of data points:", len(Y))
 
     model = GaussianMixture(n_components=10)
     model.fit(X)
     M = model.means_
     R = model.predict_proba(X)
 
-    print "Purity:", purity(Y, R) # max is 1, higher is better
-    print "DBI:", DBI(X, M, R) # lower is better
+    print("Purity:", purity(Y, R)) # max is 1, higher is better
+    print("DBI:", DBI(X, M, R)) # lower is better
 
 
 if __name__ == "__main__":
diff --git a/unsupervised_class/hcluster.py b/unsupervised_class/hcluster.py
@@ -1,5 +1,12 @@
 # https://deeplearningcourses.com/c/cluster-analysis-unsupervised-machine-learning-python
 # https://www.udemy.com/cluster-analysis-unsupervised-machine-learning-python
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range, input
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 
@@ -20,7 +27,7 @@ def main():
     X[600:, :] = np.random.randn(300, D) + mu3
 
     Z = linkage(X, 'ward')
-    print "Z.shape:", Z.shape
+    print("Z.shape:", Z.shape)
     # Z has the format [idx1, idx2, dist, sample_count]
     # therefore, its size will be (N-1, 4)
 
diff --git a/unsupervised_class/kmeans.py b/unsupervised_class/kmeans.py
@@ -1,5 +1,12 @@
 # https://deeplearningcourses.com/c/cluster-analysis-unsupervised-machine-learning-python
 # https://www.udemy.com/cluster-analysis-unsupervised-machine-learning-python
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range, input
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 import matplotlib.pyplot as plt
 
@@ -11,9 +18,9 @@ def d(u, v):
 
 def cost(X, R, M):
     cost = 0
-    for k in xrange(len(M)):
+    for k in range(len(M)):
         # method 1
-        # for n in xrange(len(X)):
+        # for n in range(len(X)):
         #     cost += R[n,k]*d(M[k], X[n])
 
         # method 2
@@ -30,28 +37,28 @@ def plot_k_means(X, K, max_iter=20, beta=1.0, show_plots=True):
     exponents = np.empty((N, K))
 
     # initialize M to random
-    for k in xrange(K):
+    for k in range(K):
         M[k] = X[np.random.choice(N)]
 
     costs = np.zeros(max_iter)
-    for i in xrange(max_iter):
+    for i in range(max_iter):
         # step 1: determine assignments / resposibilities
         # is this inefficient?
-        for k in xrange(K):
-            for n in xrange(N):
-                # R[n,k] = np.exp(-beta*d(M[k], X[n])) / np.sum( np.exp(-beta*d(M[j], X[n])) for j in xrange(K) )
+        for k in range(K):
+            for n in range(N):
+                # R[n,k] = np.exp(-beta*d(M[k], X[n])) / np.sum( np.exp(-beta*d(M[j], X[n])) for j in range(K) )
                 exponents[n,k] = np.exp(-beta*d(M[k], X[n]))
 
         R = exponents / exponents.sum(axis=1, keepdims=True)
         # assert(np.abs(R - R2).sum() < 10e-10)
 
         # step 2: recalculate means
-        for k in xrange(K):
+        for k in range(K):
             M[k] = R[:,k].dot(X) / R[:,k].sum()
 
         costs[i] = cost(X, R, M)
         if i > 0:
-            if np.abs(costs[i] - costs[i-1]) < 10e-5:
+            if np.abs(costs[i] - costs[i-1]) < 1e-5:
                 break
 
     if show_plots:
diff --git a/unsupervised_class/kmeans_fail.py b/unsupervised_class/kmeans_fail.py
@@ -1,5 +1,12 @@
 # https://deeplearningcourses.com/c/cluster-analysis-unsupervised-machine-learning-python
 # https://www.udemy.com/cluster-analysis-unsupervised-machine-learning-python
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range, input
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import numpy as np
 from kmeans import plot_k_means
 
@@ -13,12 +20,12 @@ def donut():
 
     # distance from origin is radius + random normal
     # angle theta is uniformly distributed between (0, 2pi)
-    R1 = np.random.randn(N/2) + R_inner
-    theta = 2*np.pi*np.random.random(N/2)
+    R1 = np.random.randn(N//2) + R_inner
+    theta = 2*np.pi*np.random.random(N//2)
     X_inner = np.concatenate([[R1 * np.cos(theta)], [R1 * np.sin(theta)]]).T
 
-    R2 = np.random.randn(N/2) + R_outer
-    theta = 2*np.pi*np.random.random(N/2)
+    R2 = np.random.randn(N//2) + R_outer
+    theta = 2*np.pi*np.random.random(N//2)
     X_outer = np.concatenate([[R2 * np.cos(theta)], [R2 * np.sin(theta)]]).T
 
     X = np.concatenate([ X_inner, X_outer ])
diff --git a/unsupervised_class/kmeans_mnist.py b/unsupervised_class/kmeans_mnist.py
diff --git a/unsupervised_class/kmeans_visualize.py b/unsupervised_class/kmeans_visualize.py