Commit 0f1e28b

Author: Lazy Programmer
Commit message: py3
1 parent f1bdc41 · commit 0f1e28b

File tree

4 files changed: +51 additions, -18 deletions


nlp_class/article_spinner.py

Lines changed: 18 additions & 9 deletions
@@ -5,6 +5,13 @@
 # Author: http://lazyprogrammer.me
 
 # A very bad article spinner using trigrams.
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import nltk
 import random
 import numpy as np
@@ -24,16 +31,15 @@
 for review in positive_reviews:
     s = review.text.lower()
     tokens = nltk.tokenize.word_tokenize(s)
-    for i in xrange(len(tokens) - 2):
+    for i in range(len(tokens) - 2):
         k = (tokens[i], tokens[i+2])
         if k not in trigrams:
             trigrams[k] = []
         trigrams[k].append(tokens[i+1])
 
-
 # turn each array of middle-words into a probability vector
 trigram_probabilities = {}
-for k, words in trigrams.iteritems():
+for k, words in iteritems(trigrams):
     # create a dictionary of word -> count
     if len(set(words)) > 1:
         # only do this when there are different possibilities for a middle word
@@ -44,7 +50,7 @@
                 d[w] = 0
             d[w] += 1
             n += 1
-        for w, c in d.iteritems():
+        for w, c in iteritems(d):
             d[w] = float(c) / n
         trigram_probabilities[k] = d
 
@@ -53,7 +59,7 @@ def random_sample(d):
     # choose a random sample from dictionary where values are the probabilities
     r = random.random()
     cumulative = 0
-    for w, p in d.iteritems():
+    for w, p in iteritems(d):
         cumulative += p
         if r < cumulative:
             return w
@@ -62,14 +68,17 @@ def random_sample(d):
 def test_spinner():
     review = random.choice(positive_reviews)
     s = review.text.lower()
-    print "Original:", s
+    print("Original:", s)
     tokens = nltk.tokenize.word_tokenize(s)
-    for i in xrange(len(tokens) - 2):
+    for i in range(len(tokens) - 2):
         if random.random() < 0.2: # 20% chance of replacement
             k = (tokens[i], tokens[i+2])
             if k in trigram_probabilities:
                 w = random_sample(trigram_probabilities[k])
                 tokens[i+1] = w
-    print "Spun:"
-    print " ".join(tokens).replace(" .", ".").replace(" '", "'").replace(" ,", ",").replace("$ ", "$").replace(" !", "!")
+    print("Spun:")
+    print(" ".join(tokens).replace(" .", ".").replace(" '", "'").replace(" ,", ",").replace("$ ", "$").replace(" !", "!"))
+
 
+if __name__ == '__main__':
+    test_spinner()
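The two recurring moves in this file are swapping dict.iteritems() for future.utils.iteritems(d) and xrange for builtins.range. Below is a minimal, self-contained sketch of the sampling technique random_sample uses (cumulative-probability, i.e. inverse-CDF, sampling over a dict of word probabilities). The toy trigram table is invented for illustration, not taken from the reviews data:

# Minimal sketch of the py2/py3-compatible patterns used above.
# The trigram table here is a made-up toy example.
from __future__ import print_function, division
from future.utils import iteritems  # behaves like dict.iteritems() on py2
import random

# (word[i], word[i+2]) -> {middle word: probability}
trigram_probabilities = {
    ('the', 'was'): {'movie': 0.5, 'plot': 0.3, 'acting': 0.2},
}

def random_sample(d):
    # inverse-CDF sampling: walk the cumulative probability until it passes r
    r = random.random()
    cumulative = 0
    for w, p in iteritems(d):
        cumulative += p
        if r < cumulative:
            return w

print(random_sample(trigram_probabilities[('the', 'was')]))

Because each probability dict sums to 1, the cumulative walk crosses r before the loop ends (up to floating-point rounding), so no fallback return is needed.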

nlp_class/lsa.py

Lines changed: 17 additions & 4 deletions
@@ -3,6 +3,11 @@
 # https://www.udemy.com/data-science-natural-language-processing-in-python
 
 # Author: http://lazyprogrammer.me
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
 
 import nltk
 import numpy as np
@@ -40,9 +45,10 @@ def my_tokenizer(s):
 all_tokens = []
 all_titles = []
 index_word_map = []
+error_count = 0
 for title in titles:
     try:
-        title = title.encode('ascii', 'ignore') # this will throw exception if bad characters
+        title = title.encode('ascii', 'ignore').decode('utf-8') # this will throw exception if bad characters
         all_titles.append(title)
         tokens = my_tokenizer(title)
         all_tokens.append(tokens)
@@ -51,9 +57,16 @@ def my_tokenizer(s):
                 word_index_map[token] = current_index
                 current_index += 1
                 index_word_map.append(token)
-    except:
-        pass
+    except Exception as e:
+        print(e)
+        print(title)
+        error_count += 1
+
 
+print("Number of errors parsing file:", error_count, "number of lines in file:", len(titles))
+if error_count == len(titles):
+    print("There is no data to do anything with! Quitting...")
+    exit()
 
 
 # now let's create our input matrices - just indicator variables for this example - works better than proportions
@@ -76,7 +89,7 @@ def main():
     svd = TruncatedSVD()
     Z = svd.fit_transform(X)
     plt.scatter(Z[:,0], Z[:,1])
-    for i in xrange(D):
+    for i in range(D):
         plt.annotate(s=index_word_map[i], xy=(Z[i,0], Z[i,1]))
     plt.show()
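The one change here that goes beyond syntax is the .decode('utf-8') chained after .encode('ascii', 'ignore'): on Python 3, str.encode() returns bytes, and the tokenizer downstream needs str back. A tiny standalone sketch of the difference (the sample title is invented):

# Python 3: encode() produces bytes; the chained decode() restores str.
title = u'caf\xe9 society'  # invented example containing a non-ASCII character

as_bytes = title.encode('ascii', 'ignore')
print(type(as_bytes), as_bytes)   # <class 'bytes'> b'caf society'

as_str = title.encode('ascii', 'ignore').decode('utf-8')
print(type(as_str), as_str)       # <class 'str'> caf society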

nlp_class/nb.py

Lines changed: 7 additions & 2 deletions
@@ -4,6 +4,11 @@
 # dataset: https://archive.ics.uci.edu/ml/datasets/Spambase
 
 # Author: http://lazyprogrammer.me
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
 
 from sklearn.naive_bayes import MultinomialNB
 import pandas as pd
@@ -27,7 +32,7 @@
 
 model = MultinomialNB()
 model.fit(Xtrain, Ytrain)
-print "Classification rate for NB:", model.score(Xtest, Ytest)
+print("Classification rate for NB:", model.score(Xtest, Ytest))
 
 
 
@@ -36,4 +41,4 @@
 
 model = AdaBoostClassifier()
 model.fit(Xtrain, Ytrain)
-print "Classification rate for AdaBoost:", model.score(Xtest, Ytest)
+print("Classification rate for AdaBoost:", model.score(Xtest, Ytest))

nlp_class/sentiment.py

Lines changed: 9 additions & 3 deletions
@@ -7,6 +7,12 @@
 # i.e. It is not optimized for anything.
 
 # Author: http://lazyprogrammer.me
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
 
 import nltk
 import numpy as np
@@ -117,13 +123,13 @@ def tokens_to_vector(tokens, label):
 
 model = LogisticRegression()
 model.fit(Xtrain, Ytrain)
-print "Classification rate:", model.score(Xtest, Ytest)
+print("Classification rate:", model.score(Xtest, Ytest))
 
 
 # let's look at the weights for each word
 # try it with different threshold values!
 threshold = 0.5
-for word, index in word_index_map.iteritems():
+for word, index in iteritems(word_index_map):
     weight = model.coef_[0][index]
     if weight > threshold or weight < -threshold:
-        print word, weight
+        print(word, weight)
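The final hunk iterates the vocabulary and prints only words whose logistic-regression weight clears a magnitude threshold. A self-contained sketch of that weight-inspection idea, with a fabricated three-word vocabulary and toy count matrix (none of this comes from the repo's data):

# Sketch of inspecting per-word logistic-regression weights.
# word_index_map, X, and Y are fabricated for illustration.
from __future__ import print_function, division
from future.utils import iteritems
import numpy as np
from sklearn.linear_model import LogisticRegression

word_index_map = {'great': 0, 'terrible': 1, 'the': 2}
X = np.array([[2, 0, 1], [0, 2, 1], [1, 0, 2], [0, 1, 2]], dtype=float)
Y = np.array([1, 0, 1, 0])

model = LogisticRegression()
model.fit(X, Y)

threshold = 0.1  # the original uses 0.5; a toy fit this small yields smaller weights
for word, index in iteritems(word_index_map):
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)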
