forked from armorleon/ProgrammerGuidToDataMining
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bayesText.py
158 lines (145 loc) · 6 KB
/
bayesText.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from __future__ import print_function
import os, codecs, math
class BayesText:
    """Naive Bayes text classifier trained from a directory of documents.

    After construction the instance exposes:

    * ``categories`` -- list of category names (the subdirectory names
      found in the training directory).
    * ``vocabulary`` -- dict mapping each retained word to its count over
      all categories.
    * ``totals`` -- dict mapping each category to the number of counted
      (non-empty, non-stop-word) tokens seen in that category.  Tokens of
      words later pruned from the vocabulary are still included here.
    * ``prob`` -- dict mapping each category to a dict of word -> smoothed
      conditional probability for every word in ``vocabulary``.
    * ``stopwords`` -- dict whose keys are the words to ignore.
    """

    # Characters trimmed from both ends of each token before counting.
    PUNCTUATION = '\'".,?:-'
    # Encoding used to read training and test documents.
    ENCODING = 'iso8859-1'
    # Words occurring fewer than this many times overall are discarded.
    MIN_WORD_COUNT = 3

    def __init__(self, trainingdir, stopwordlist):
        """Train the classifier.

        trainingdir is the training data.  Each subdirectory of
        trainingdir is titled with the name of the classification
        category -- those subdirectories in turn contain the text
        files for that category.  A trailing path separator on
        trainingdir is accepted but not required.

        stopwordlist is the path of a file holding one word per line;
        those words are removed before any counting takes place.
        """
        self.vocabulary = {}
        self.prob = {}
        self.totals = {}
        self.stopwords = {}
        with open(stopwordlist) as f:
            for line in f:
                self.stopwords[line.strip()] = 1

        # Only directories are categories; stray files are ignored.
        self.categories = [
            name for name in os.listdir(trainingdir)
            if os.path.isdir(os.path.join(trainingdir, name))]

        print("Counting ...")
        for category in self.categories:
            print(' ' + category)
            (self.prob[category],
             self.totals[category]) = self.train(trainingdir, category)

        # Eliminate any word that does not occur at least MIN_WORD_COUNT
        # times.  The words are collected first because a dict cannot be
        # modified while it is being iterated over.
        rare = [word for word, count in self.vocabulary.items()
                if count < self.MIN_WORD_COUNT]
        for word in rare:
            del self.vocabulary[word]

        # Turn the raw per-category counts into add-one (Laplace)
        # smoothed probabilities.  Entries for pruned words are left in
        # self.prob as raw counts; classify() never looks them up.
        vocabLength = len(self.vocabulary)
        print("Computing probabilities:")
        for category in self.categories:
            print(' ' + category)
            denominator = self.totals[category] + vocabLength
            counts = self.prob[category]
            for word in self.vocabulary:
                # A word never seen in this category has count 0, so it
                # gets probability 1 / denominator.
                counts[word] = float(counts.get(word, 0) + 1) / denominator
        print("DONE TRAINING\n\n")

    def train(self, trainingdir, category):
        """Count word occurrences for a particular category.

        Every file in the category's subdirectory of trainingdir is read.
        Tokens are split on whitespace, stripped of surrounding
        punctuation and lowercased; empty tokens and stop words are
        skipped.  self.vocabulary is updated as a side effect.

        Returns a tuple (counts, total) where counts maps each word to
        its number of occurrences in this category and total is the
        number of tokens counted.
        """
        currentdir = os.path.join(trainingdir, category)
        counts = {}
        total = 0
        for filename in os.listdir(currentdir):
            path = os.path.join(currentdir, filename)
            with codecs.open(path, 'r', self.ENCODING) as f:
                for line in f:
                    for token in line.split():
                        token = token.strip(self.PUNCTUATION).lower()
                        if token and token not in self.stopwords:
                            self.vocabulary[token] = (
                                self.vocabulary.get(token, 0) + 1)
                            counts[token] = counts.get(token, 0) + 1
                            total += 1
        return (counts, total)

    def classify(self, filename):
        """Return the most probable category for the file at filename.

        Each category's score is the sum of the log probabilities of the
        file's tokens that are in the vocabulary; other tokens are
        ignored.  Logs are summed instead of multiplying probabilities
        to avoid floating point underflow.  On a tie the category that
        comes first in self.categories wins.
        """
        results = dict((category, 0) for category in self.categories)
        with codecs.open(filename, 'r', self.ENCODING) as f:
            for line in f:
                for token in line.split():
                    token = token.strip(self.PUNCTUATION).lower()
                    if token not in self.vocabulary:
                        continue
                    for category in self.categories:
                        results[category] += math.log(
                            self.prob[category][token])
        # for debugging, return `ranked` to see the entire list
        ranked = sorted(results.items(), key=lambda item: item[1],
                        reverse=True)
        return ranked[0][0]

    def testCategory(self, directory, category):
        """Classify every entry in directory and compare with category.

        Returns a tuple (correct, total): the number of files classified
        as category and the number of files examined.
        """
        total = 0
        correct = 0
        for filename in os.listdir(directory):
            total += 1
            if self.classify(os.path.join(directory, filename)) == category:
                correct += 1
        return (correct, total)

    def test(self, testdir):
        """Test all files in the test directory--that directory is
        organized into subdirectories--each subdir is a classification
        category.

        Prints one dot per category followed by the overall accuracy.
        When there are no test instances the accuracy is reported as 0
        rather than dividing by zero.
        """
        # filter out files that are not directories
        categories = [
            name for name in os.listdir(testdir)
            if os.path.isdir(os.path.join(testdir, name))]
        correct = 0
        total = 0
        for category in categories:
            print(".", end="")
            (catCorrect, catTotal) = self.testCategory(
                os.path.join(testdir, category), category)
            correct += catCorrect
            total += catTotal
        accuracy = (float(correct) / total) * 100 if total else 0.0
        print("\n\nAccuracy is %f%% (%i test instances)" %
              (accuracy, total))
# Adjust these paths to wherever the 20 Newsgroups data lives locally.
# Each one ends with a slash because they are joined by concatenation.
baseDirectory = "/Users/raz/Dropbox/guide/data/20news-bydate/"
trainingDir = baseDirectory + "20news-bydate-train/"
testDir = baseDirectory + "20news-bydate-test/"
# NOTE(review): assigned but not referenced by the runs below.
stoplistfile = "/Users/raz/Downloads/20news-bydate/stopwords0.txt"

# Train and evaluate once per stop-word list, in this order.  Each entry
# is (banner printed before the run, stop-word file under baseDirectory).
for banner, stopwordFile in (
        ("Reg stoplist 0 ", "stopwords0.txt"),
        ("\n\nReg stoplist 25 ", "stopwords25.txt"),
        ("\n\nReg stoplist 174 ", "stopwords174.txt")):
    print(banner)
    bT = BayesText(trainingDir, baseDirectory + stopwordFile)
    print("Running Test ...")
    bT.test(testDir)