-
Notifications
You must be signed in to change notification settings - Fork 189
/
info.py
228 lines (193 loc) · 6.68 KB
/
info.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
from __future__ import division
from math import log, exp
from operator import mul
from collections import Counter
import os
import pylab
import cPickle
class MyDict(dict):
    """
    Dictionary whose item lookup yields 0 for keys that are not stored.

    Looking up an absent key does not insert it, so `key in d` stays False
    until a value is actually assigned.
    """

    def __getitem__(self, key):
        # A single lookup with a fallback covers both the stored and the
        # absent case.
        return self.get(key, 0)
# Feature -> count table for the positive class; train() increments it and
# absent features read as 0 (see MyDict).
pos = MyDict()
# Feature -> count table for the negative class; same conventions as `pos`.
neg = MyDict()
# Set of features that classify() and get_relevant_features() restrict
# themselves to; filled in by feature_selection_trials().
features = set()
# [sum of all counts in pos, sum of all counts in neg]; set by train().
totals = [0, 0]
# All non-alphanumeric characters among the first 128 code points. The only
# reference visible in this file is a commented-out line in negate_sequence().
delchars = ''.join(c for c in map(chr, range(128)) if not c.isalnum())
# Pickle cache written/read by train(): the (pos, neg, totals) tuple.
CDATA_FILE = "countdata.pickle"
# Pickle cache written by feature_selection_trials(): counts restricted to
# the selected features.
FDATA_FILE = "reduceddata.pickle"
def negate_sequence(text):
    """
    Turn `text` into a list of unigram, bigram and trigram features,
    rewriting tokens that fall under a negation into "not_" form.

    The text is split on whitespace. Each token is stripped of the
    punctuation in "?.,!:;" at both ends and lower-cased, then emitted,
    followed by the bigram and trigram ending at that token when enough
    preceding tokens exist. A raw token containing "not", "n't" or "no"
    (case-sensitive substring match) toggles the negation state for the
    tokens after it; a raw token containing any of the punctuation marks
    switches negation off.
    """
    punctuation = "?.,!:;"
    negation_markers = ("not", "n't", "no")

    tokens = []
    negating = False
    last = None
    before_last = None

    for raw in text.split():
        token = raw.strip(punctuation).lower()
        if negating:
            token = "not_" + token
        tokens.append(token)

        if last:
            pair = last + " " + token
            tokens.append(pair)
            if before_last:
                tokens.append(before_last + " " + pair)
            before_last = last
        last = token

        # The markers are matched against the raw token, before stripping
        # and lower-casing.
        if any(marker in raw for marker in negation_markers):
            negating = not negating

        # Punctuation ends the scope of a negation.
        if any(mark in raw for mark in punctuation):
            negating = False

    return tokens
def train():
    """
    Fill the global `pos`, `neg` and `totals` tables.

    When CDATA_FILE exists (and `retrain` is False) the three tables are
    unpickled from it and the function returns. Otherwise up to 12500
    reviews are read from each of ./aclImdb/train/pos and
    ./aclImdb/train/neg. For every distinct feature of a review, the
    feature is counted once for the review's own class and its "not_"
    form is counted once for the opposite class. Rare features are then
    pruned, the totals are recomputed and the result is pickled to
    CDATA_FILE.
    """
    global pos, neg, totals
    retrain = False

    # Load counts if they already exist.
    # NOTE(review): unpickling can execute arbitrary code; only load cache
    # files produced by this script.
    if not retrain and os.path.isfile(CDATA_FILE):
        with open(CDATA_FILE) as cache:
            pos, neg, totals = cPickle.load(cache)
        return

    limit = 12500
    # (directory, table for the review's class, table for the other class)
    corpora = (
        ("./aclImdb/train/pos/", pos, neg),
        ("./aclImdb/train/neg/", neg, pos),
    )
    for directory, same_class, other_class in corpora:
        for fname in os.listdir(directory)[:limit]:
            # Close each review as soon as it is read instead of leaving
            # thousands of handles for the garbage collector.
            with open(directory + fname) as review:
                text = review.read()
            for word in set(negate_sequence(text)):
                same_class[word] += 1
                other_class['not_' + word] += 1

    prune_features()

    totals[0] = sum(pos.values())
    totals[1] = sum(neg.values())

    # The file mode is kept as in the original so existing cache files stay
    # readable; the `with` block guarantees the dump is flushed and closed.
    with open(CDATA_FILE, 'w') as cache:
        cPickle.dump((pos, neg, totals), cache)
def classify(text):
    """
    Return True when `text` scores higher for the positive class than for
    the negative class, False otherwise.

    Only features present in the global `features` set are scored. When
    the text contains none of them the result defaults to True.
    """
    selected = {token for token in negate_sequence(text) if token in features}
    if not selected:
        return True

    # Accumulate add-one smoothed log-likelihoods for each class.
    pos_prob = 0
    neg_prob = 0
    for token in selected:
        pos_prob += log((pos[token] + 1) / (2 * totals[0]))
        neg_prob += log((neg[token] + 1) / (2 * totals[1]))
    return pos_prob > neg_prob
def classify2(text):
    """
    Classify `text` using pretrained counts.

    Every feature of the text that is stored in either `pos` or `neg` is
    scored (the global `features` set is not consulted). Returns True when
    the positive log-likelihood exceeds the negative one, and True as well
    when the text has no known feature at all.
    """
    known = {token for token in negate_sequence(text)
             if token in pos or token in neg}
    if not known:
        return True

    # Add-one smoothed log-likelihood of the known features under each class.
    pos_prob = sum(log((pos[token] + 1) / (2 * totals[0])) for token in known)
    neg_prob = sum(log((neg[token] + 1) / (2 * totals[1])) for token in known)
    return pos_prob > neg_prob
def classify_demo(text):
    """
    Print, for every known feature of `text`, its smoothed likelihood under
    the positive and the negative class, followed by the overall verdict
    and the absolute difference of the two log scores.

    Returns True (after printing a notice) when the text has no feature
    stored in `pos` or `neg`; otherwise nothing is returned.
    """
    known = {token for token in negate_sequence(text)
             if token in pos or token in neg}
    if not known:
        print("No features to compare on")
        return True

    pos_score, neg_score = 0, 0
    for token in known:
        pos_term = log((pos[token] + 1) / (2 * totals[0]))
        neg_term = log((neg[token] + 1) / (2 * totals[1]))
        print("%15s %.9f %.9f" % (token, exp(pos_term), exp(neg_term)))
        pos_score += pos_term
        neg_score += neg_term

    verdict = "Positive" if pos_score > neg_score else "Negative"
    print("%s log-diff = %.9f" % (verdict, abs(pos_score - neg_score)))
def MI(word):
    """
    Compute the weighted mutual information of a term.

    Reads the global `pos`, `neg` and `totals` tables. Returns 0 for a
    term with no counts. A class contributes to the score only when the
    term has a positive count in that class; its contribution is the sum
    of an "absent from the class" term and a "present in the class" term.

    A term that accounts for a class's entire total has an "absent" count
    of exactly zero. That term is treated as contributing 0 (the usual
    0 * log 0 convention) instead of evaluating log(0) or dividing by
    T - W == 0.
    """
    T = totals[0] + totals[1]
    W = pos[word] + neg[word]
    if W == 0:
        return 0

    I = 0
    # Negative class first, then positive, matching the original summation
    # order so floating-point results are unchanged.
    for count, class_total in ((neg[word], totals[1]), (pos[word], totals[0])):
        if count <= 0:
            continue
        absent = class_total - count
        # Skip only the exact-zero case; inconsistent (negative) counts
        # still surface as an error from log().
        if absent != 0:
            I += absent / T * log(absent * T / (T - W) / class_total)
        I += count / T * log(count * T / W / class_total)
    return I
def get_relevant_features():
    """
    Return (pos_counts, neg_counts, totals) restricted to the features in
    the global `features` set.

    The two count tables are fresh MyDict instances; the totals are the
    sums of their values, recomputed for the reduced tables.
    """
    kept_pos = MyDict((key, count) for key, count in pos.items()
                      if key in features)
    kept_neg = MyDict((key, count) for key, count in neg.items()
                      if key in features)
    kept_totals = [sum(kept_pos.values()), sum(kept_neg.values())]
    return (kept_pos, kept_neg, kept_totals)
def prune_features():
    """
    Drop rare features from the global count tables in place.

    A feature is removed from a table when its count is at most 1 in both
    `pos` and `neg`.
    """
    global pos, neg

    # Collect the keys first so the table is not modified while it is
    # being iterated.
    rare_in_pos = [key for key in pos if pos[key] <= 1 and neg[key] <= 1]
    for key in rare_in_pos:
        del pos[key]

    # Evaluated after the first pass, as in the original; a key already
    # removed from `pos` reads as 0 there and therefore still qualifies.
    rare_in_neg = [key for key in neg if neg[key] <= 1 and pos[key] <= 1]
    for key in rare_in_neg:
        del neg[key]
def feature_selection_trials():
    """
    Select top k features. Vary k and plot data.

    All features stored in `pos` or `neg` are ranked by descending MI().
    Starting with the top 20000, the global `features` set grows in steps
    of 500 up to 40000. After each step, classify() is run on up to 500
    positive and 500 negative reviews from ./aclImdb/test/ and the
    accuracy is printed. Afterwards `features` is rebound to the feature
    count with the highest accuracy (the smallest such count on ties), the
    reduced tables from get_relevant_features() are pickled to FDATA_FILE,
    and accuracy against feature count is plotted with pylab.

    When `retrain` is False and FDATA_FILE exists, the cached tables are
    loaded instead and nothing else is done.
    """
    global pos, neg, totals, features
    retrain = True

    # NOTE(review): unpickling can execute arbitrary code; only load cache
    # files produced by this script.
    if not retrain and os.path.isfile(FDATA_FILE):
        with open(FDATA_FILE) as cache:
            pos, neg, totals = cPickle.load(cache)
        return

    words = list(set(pos) | set(neg))
    print("Total no of features: %d" % len(words))
    words.sort(key=lambda w: -MI(w))

    limit = 500
    path = "./aclImdb/test/"
    step = 500
    start = 20000

    # Read the evaluation reviews once, closing each file, rather than
    # re-opening all of them on every sweep iteration.
    labelled = []
    for label, expected in (("pos", True), ("neg", False)):
        for fname in os.listdir(path + label)[:limit]:
            with open(path + label + "/" + fname) as review:
                labelled.append((review.read(), expected))

    num_features, accuracy = [], []
    bestk = 0
    best_accuracy = 0.0
    features.update(words[:start])
    for k in range(start, 40000, step):
        features.update(words[k:k + step])

        correct = sum(1 for text, expected in labelled
                      if classify(text) == expected)
        score = correct / len(labelled)

        num_features.append(k + step)
        accuracy.append(score)
        if score > best_accuracy:
            # Remember the running best; without this update every
            # non-zero score would overwrite bestk. The evaluated feature
            # set holds k + step entries, not k.
            best_accuracy = score
            bestk = k + step
        print("%d %s" % (k + step, score))

    features = set(words[:bestk])
    with open(FDATA_FILE, 'w') as out:
        cPickle.dump(get_relevant_features(), out)

    pylab.plot(num_features, accuracy)
    pylab.show()
def test_pang_lee():
    """
    Tests the Pang Lee dataset.

    Runs classify2() on every file under txt_sentoken/pos (expected
    positive) and txt_sentoken/neg (expected negative) and prints the
    fraction classified as expected.
    """
    total, correct = 0, 0
    for label, expected in (("pos", True), ("neg", False)):
        directory = "txt_sentoken/" + label
        for fname in os.listdir(directory):
            # Close each review right after reading it.
            with open(directory + "/" + fname) as review:
                text = review.read()
            correct += int(classify2(text) == expected)
            total += 1
    print("accuracy: %f" % (correct / total))
# Script entry point: build (or load from cache) the count tables, then run
# the feature-selection sweep. The commented-out calls are alternative
# experiments that can be enabled by hand.
if __name__ == '__main__':
    train()
    feature_selection_trials()
    # test_pang_lee()
    # classify_demo(open("pos_example").read())
    # classify_demo(open("neg_example").read())