Commit 0f1e28b

Author: Lazy Programmer
Commit message: py3
1 parent f1bdc41 · commit 0f1e28b

File tree

4 files changed: +51 additions, -18 deletions


nlp_class/article_spinner.py

Lines changed: 18 additions & 9 deletions
@@ -5,6 +5,13 @@
 # Author: http://lazyprogrammer.me
 
 # A very bad article spinner using trigrams.
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
+
 import nltk
 import random
 import numpy as np
@@ -24,16 +31,15 @@
 for review in positive_reviews:
     s = review.text.lower()
     tokens = nltk.tokenize.word_tokenize(s)
-    for i in xrange(len(tokens) - 2):
+    for i in range(len(tokens) - 2):
         k = (tokens[i], tokens[i+2])
         if k not in trigrams:
             trigrams[k] = []
         trigrams[k].append(tokens[i+1])
 
-
 # turn each array of middle-words into a probability vector
 trigram_probabilities = {}
-for k, words in trigrams.iteritems():
+for k, words in iteritems(trigrams):
     # create a dictionary of word -> count
     if len(set(words)) > 1:
         # only do this when there are different possibilities for a middle word
@@ -44,7 +50,7 @@
                 d[w] = 0
             d[w] += 1
             n += 1
-        for w, c in d.iteritems():
+        for w, c in iteritems(d):
             d[w] = float(c) / n
         trigram_probabilities[k] = d
 
@@ -53,7 +59,7 @@ def random_sample(d):
     # choose a random sample from dictionary where values are the probabilities
     r = random.random()
     cumulative = 0
-    for w, p in d.iteritems():
+    for w, p in iteritems(d):
         cumulative += p
         if r < cumulative:
             return w
@@ -62,14 +68,17 @@ def random_sample(d):
 def test_spinner():
     review = random.choice(positive_reviews)
     s = review.text.lower()
-    print "Original:", s
+    print("Original:", s)
     tokens = nltk.tokenize.word_tokenize(s)
-    for i in xrange(len(tokens) - 2):
+    for i in range(len(tokens) - 2):
         if random.random() < 0.2: # 20% chance of replacement
             k = (tokens[i], tokens[i+2])
             if k in trigram_probabilities:
                 w = random_sample(trigram_probabilities[k])
                 tokens[i+1] = w
-    print "Spun:"
-    print " ".join(tokens).replace(" .", ".").replace(" '", "'").replace(" ,", ",").replace("$ ", "$").replace(" !", "!")
+    print("Spun:")
+    print(" ".join(tokens).replace(" .", ".").replace(" '", "'").replace(" ,", ",").replace("$ ", "$").replace(" !", "!"))
+
 
+if __name__ == '__main__':
+    test_spinner()
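The two recurring moves in this file are swapping dict.iteritems() for future.utils.iteritems(d) and xrange for builtins.range. Below is a minimal, self-contained sketch of the sampling technique random_sample uses (cumulative-probability, i.e. inverse-CDF, sampling over a dict of word probabilities). The toy trigram table is invented for illustration, not taken from the reviews data:

# Minimal sketch of the py2/py3-compatible patterns used above.
# The trigram table here is a made-up toy example.
from __future__ import print_function, division
from future.utils import iteritems  # behaves like dict.iteritems() on py2
import random

# (word[i], word[i+2]) -> {middle word: probability}
trigram_probabilities = {
    ('the', 'was'): {'movie': 0.5, 'plot': 0.3, 'acting': 0.2},
}

def random_sample(d):
    # inverse-CDF sampling: walk the cumulative probability until it passes r
    r = random.random()
    cumulative = 0
    for w, p in iteritems(d):
        cumulative += p
        if r < cumulative:
            return w

print(random_sample(trigram_probabilities[('the', 'was')]))

Because each probability dict sums to 1, the cumulative walk crosses r before the loop ends (up to floating-point rounding), so no fallback return is needed.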

nlp_class/lsa.py

Lines changed: 17 additions & 4 deletions
@@ -3,6 +3,11 @@
 # https://www.udemy.com/data-science-natural-language-processing-in-python
 
 # Author: http://lazyprogrammer.me
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
 
 import nltk
 import numpy as np
@@ -40,9 +45,10 @@ def my_tokenizer(s):
 all_tokens = []
 all_titles = []
 index_word_map = []
+error_count = 0
 for title in titles:
     try:
-        title = title.encode('ascii', 'ignore') # this will throw exception if bad characters
+        title = title.encode('ascii', 'ignore').decode('utf-8') # this will throw exception if bad characters
         all_titles.append(title)
         tokens = my_tokenizer(title)
         all_tokens.append(tokens)
@@ -51,9 +57,16 @@ def my_tokenizer(s):
                 word_index_map[token] = current_index
                 current_index += 1
                 index_word_map.append(token)
-    except:
-        pass
+    except Exception as e:
+        print(e)
+        print(title)
+        error_count += 1
+
 
+print("Number of errors parsing file:", error_count, "number of lines in file:", len(titles))
+if error_count == len(titles):
+    print("There is no data to do anything with! Quitting...")
+    exit()
 
 
 # now let's create our input matrices - just indicator variables for this example - works better than proportions
@@ -76,7 +89,7 @@ def main():
     svd = TruncatedSVD()
     Z = svd.fit_transform(X)
     plt.scatter(Z[:,0], Z[:,1])
-    for i in xrange(D):
+    for i in range(D):
         plt.annotate(s=index_word_map[i], xy=(Z[i,0], Z[i,1]))
     plt.show()
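The one change here that goes beyond syntax is the .decode('utf-8') chained after .encode('ascii', 'ignore'): on Python 3, str.encode() returns bytes, and the tokenizer downstream needs str back. A tiny standalone sketch of the difference (the sample title is invented):

# Python 3: encode() produces bytes; the chained decode() restores str.
title = u'caf\xe9 society'  # invented example containing a non-ASCII character

as_bytes = title.encode('ascii', 'ignore')
print(type(as_bytes), as_bytes)   # <class 'bytes'> b'caf society'

as_str = title.encode('ascii', 'ignore').decode('utf-8')
print(type(as_str), as_str)       # <class 'str'> caf society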

nlp_class/nb.py

Lines changed: 7 additions & 2 deletions
@@ -4,6 +4,11 @@
 # dataset: https://archive.ics.uci.edu/ml/datasets/Spambase
 
 # Author: http://lazyprogrammer.me
+from __future__ import print_function, division
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
 
 from sklearn.naive_bayes import MultinomialNB
 import pandas as pd
@@ -27,7 +32,7 @@
 
 model = MultinomialNB()
 model.fit(Xtrain, Ytrain)
-print "Classification rate for NB:", model.score(Xtest, Ytest)
+print("Classification rate for NB:", model.score(Xtest, Ytest))
 
 
 
@@ -36,4 +41,4 @@
 
 model = AdaBoostClassifier()
 model.fit(Xtrain, Ytrain)
-print "Classification rate for AdaBoost:", model.score(Xtest, Ytest)
+print("Classification rate for AdaBoost:", model.score(Xtest, Ytest))

nlp_class/sentiment.py

Lines changed: 9 additions & 3 deletions
@@ -7,6 +7,12 @@
 # i.e. It is not optimized for anything.
 
 # Author: http://lazyprogrammer.me
+from __future__ import print_function, division
+from future.utils import iteritems
+from builtins import range
+# Note: you may need to update your version of future
+# sudo pip install -U future
+
 
 import nltk
 import numpy as np
@@ -117,13 +123,13 @@ def tokens_to_vector(tokens, label):
 
 model = LogisticRegression()
 model.fit(Xtrain, Ytrain)
-print "Classification rate:", model.score(Xtest, Ytest)
+print("Classification rate:", model.score(Xtest, Ytest))
 
 
 # let's look at the weights for each word
 # try it with different threshold values!
 threshold = 0.5
-for word, index in word_index_map.iteritems():
+for word, index in iteritems(word_index_map):
     weight = model.coef_[0][index]
     if weight > threshold or weight < -threshold:
-        print word, weight
+        print(word, weight)
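The final hunk iterates the vocabulary and prints only words whose logistic-regression weight clears a magnitude threshold. A self-contained sketch of that weight-inspection idea, with a fabricated three-word vocabulary and toy count matrix (none of this comes from the repo's data):

# Sketch of inspecting per-word logistic-regression weights.
# word_index_map, X, and Y are fabricated for illustration.
from __future__ import print_function, division
from future.utils import iteritems
import numpy as np
from sklearn.linear_model import LogisticRegression

word_index_map = {'great': 0, 'terrible': 1, 'the': 2}
X = np.array([[2, 0, 1], [0, 2, 1], [1, 0, 2], [0, 1, 2]], dtype=float)
Y = np.array([1, 0, 1, 0])

model = LogisticRegression()
model.fit(X, Y)

threshold = 0.1  # the original uses 0.5; a toy fit this small yields smaller weights
for word, index in iteritems(word_index_map):
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)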
