codebox · anchitjain1234 · Sep 21, 2015 · Sep 21, 2015 · Sep 22, 2015 · Sep 22, 2015
diff --git a/classify.py b/classify.py
@@ -2,12 +2,50 @@
 from mode import Mode
 from db import Db
 from words import text_to_list
-
+import operator
+import pdb
 class Classify(Mode):
 	MIN_WORD_COUNT = 5
 	RARE_WORD_PROB = 0.5
 	EXCLUSIVE_WORD_PROB = 0.99
 
+	def top_100_words(self,words,db):
+		"""Return up to 100 of the most discriminating words from `words`.
+
+		Words whose spamicity (self.p_for_word) lies strictly between 0.45
+		and 0.55 are dropped. The remaining words are ranked by the absolute
+		difference between their relative frequencies in the two doc types,
+		largest difference first.
+
+		words -- iterable of candidate words
+		db    -- handle passed through to p_for_word and get_word_count
+
+		Returns a list of at most 100 words, or `words` itself unchanged
+		when every candidate was dropped.
+		"""
+		# Float totals force true division; with integer counts a plain `/`
+		# truncates to 0 on Python 2 and makes every score identical.
+		total1 = float(self.doctype1_word_count)
+		total2 = float(self.doctype2_word_count)
+
+		scores = {}
+		for word in words:
+			spamicity = self.p_for_word(db, word)
+			if 0.45 < spamicity < 0.55:
+				continue
+			# A doc type with no words contributes frequency 0 instead of
+			# raising ZeroDivisionError.
+			p1 = self.wc_doctype_1(word, db) / total1 if total1 else 0.0
+			p2 = self.wc_doctype_2(word, db) / total2 if total2 else 0.0
+			scores[word] = abs(p1 - p2)
+
+		if not scores:
+			return words
+
+		ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
+		return [word for word, _score in ranked[:100]]
+
+	def wc_doctype_1(self,word,db):
+		"""Return db.get_word_count for `word` under self.doctype1."""
+		doctype = self.doctype1
+		return db.get_word_count(doctype, word)
+
+	def wc_doctype_2(self,word,db):
+		"""Return db.get_word_count for `word` under self.doctype2."""
+		doctype = self.doctype2
+		return db.get_word_count(doctype, word)
+
+	def get_total_wc(self):
+		"""Return the sum of doctype1_word_count and doctype2_word_count."""
+		first_total = self.doctype1_word_count
+		second_total = self.doctype2_word_count
+		return first_total + second_total
+
 	def set_text(self, text):
 		words = text_to_list(text)
 
@@ -84,6 +122,7 @@ def execute(self):
 		self.doctype1_word_count = db.get_words_count(self.doctype1)
 		self.doctype2_word_count = db.get_words_count(self.doctype2)
 
+		self.words=self.top_100_words(self.words,db)
 		for word in self.words:
 			p = self.p_for_word(db, word)
 			pl.append(p)

diff --git a/learn.py b/learn.py
@@ -2,11 +2,25 @@
 from mode import Mode
 from words import list_to_dict
 from words import text_to_list
-
+import os
 class Learn(Mode):
+	def read_from_dir(self,dirname):
+		"""Return the concatenated text of every file found under `dirname`.
+
+		The tree is traversed with os.walk, so files in nested directories
+		are included. Contents are joined with no separator, in the order
+		os.walk yields them.
+
+		dirname -- path of the directory to read
+
+		Errors raised by open()/read() propagate to the caller.
+		"""
+		chunks = []
+		for dpath, dnames, fnames in os.walk(dirname):
+			for fname in fnames:
+				# `with` closes each handle as soon as it has been read, so a
+				# large tree cannot exhaust the process's file descriptors.
+				with open(os.path.join(dpath, fname), 'r') as handle:
+					chunks.append(handle.read())
+		# One join avoids rebuilding the string for every file.
+		return ''.join(chunks)
+
+	def get_file_count_from_dir(self,dirname):
+		"""Return the number of files os.walk finds under `dirname`, subdirectories included."""
+		return sum(len(fnames) for _dpath, _dnames, fnames in os.walk(dirname))
+
 	def validate(self, args):
 		valid_args = False
-		usage = 'Usage: %s learn <doc type> <file> <count>' % args[0]
+		usage = 'Usage: %s learn <doc type> <file> <count>\n    or %s learn <doc type> <folder>' % (args[0],args[0])
 
 		if len(args) == 5:
 			doc_type = args[2]
@@ -27,6 +41,25 @@ def validate(self, args):
 			self.count = count
 			self.doc_type = doc_type
 
+		elif len(args) == 4:
+			# Folder form: args[3] is a directory. The text comes from
+			# read_from_dir and the count from get_file_count_from_dir.
+			doc_type = args[2]
+
+			try:
+				file_contents = self.read_from_dir(args[3])
+			except Exception as e:
+				raise ValueError(usage + '\nUnable to read specified directory "%s", the error message was: %s' % (args[3], e))
+
+			try:
+				count = self.get_file_count_from_dir(args[3])
+			except Exception as e:
+				# Bind the exception here: the message below formats `e`, which
+				# a bare `except:` leaves undefined (NameError, not ValueError).
+				raise ValueError(usage + '\nUnable to get file count from specified directory "%s" , the error message was: %s' % (args[3], e))
+
+			self.file_contents = file_contents
+			self.count = count
+			self.doc_type = doc_type
+
 		else:
 			raise ValueError(usage)				
 

diff --git a/words.py b/words.py
@@ -1,10 +1,11 @@
 import re
 from collections import defaultdict
-
+from nltk.stem import WordNetLemmatizer
 # Tuple of frequent English words. NOTE(review): presumably these are excluded when text is tokenised; the usage is not visible here - confirm.
 commonWords = ('the','be','to','of','and','a','in','that','have','it','is','im','are','was','for','on','with','he','as','you','do','at','this','but','his','by','from','they','we','say','her','she','or','an','will','my','one','all','would','there','their','what','so','up','out','if','about','who','get','which','go','me','when','make','can','like','time','just','him','know','take','person','into','year','your','some','could','them','see','other','than','then','now','look','only','come','its','over','think','also','back','after','use','two','how','our','way','even','because','any','these','us')
 
 def cleanUpWord(word):
 	word = word.lower()
+	word = lmtzr.lemmatize(word, pos='v')
 	if (len(word) < 2):
 		return None
 	elif (word.isdigit()):