-
Notifications
You must be signed in to change notification settings - Fork 0
/
Docs_class.py
56 lines (56 loc) · 1.9 KB
/
Docs_class.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import nltk
from math import log
# Fetch the NLTK resources identified as "book" and "omw-1.4" every time this
# module is imported.
# NOTE(review): presumably these are needed by the documents' racinisation()
# implementation, which is defined outside this file - confirm.
nltk.download("book")
nltk.download('omw-1.4')
class Docs():
    """A small collection of documents that can be queried for one word.

    Every document passed in must expose a ``name`` attribute and a
    ``racinisation()`` method returning a sequence of string tokens
    (presumably the stemmed words of the document; that method is defined
    outside this class).
    """

    def __init__(self, *doc_list):
        """Store the documents given as positional arguments."""
        self.doc_list = doc_list

    def _hosts(self, word):
        """Return ``(doc, stems)`` pairs for the documents containing *word*.

        ``racinisation()`` is called exactly once per document so the public
        methods do not recompute it inside their loops.
        """
        hosts = []
        for doc in self.doc_list:
            stems = doc.racinisation()
            if word in stems:
                hosts.append((doc, stems))
        return hosts

    @staticmethod
    def _count(word, stems):
        """Count the occurrences of *word* in *stems*, ignoring case.

        Both sides are lowercased: comparing lowercased stems against the
        raw word made any capitalised word count as zero.
        """
        key = word.lower()
        return sum(1 for stem in stems if stem.lower() == key)

    def doc_hote(self, word):
        """Return the names of the documents whose stems contain *word*."""
        return [doc.name for doc, _ in self._hosts(word)]

    def word_frequency(self, word):
        """Return ``(word, count, doc_name)`` for each document with *word*.

        ``count`` is the case-insensitive number of occurrences of *word*
        among the document's stems.
        """
        return [
            (word, self._count(word, stems), doc.name)
            for doc, stems in self._hosts(word)
        ]

    def weight(self, word):
        """Return ``(word, weight, doc_name)`` for each document with *word*.

        The weight is ``(1 + ln(count)) * ln(N / df)`` where ``count`` is the
        case-insensitive occurrence count in the document, ``N`` the number
        of documents in the collection and ``df`` the number of documents
        containing *word*. Returns an empty list when no document has it.
        """
        hosts = self._hosts(word)
        if not hosts:
            return []
        total_docs = len(self.doc_list)
        host_count = len(hosts)
        # count is always >= 1 here: the word is among the stems and the
        # count is case-insensitive, so log() never receives 0.
        return [
            (
                word,
                (1 + log(self._count(word, stems))) * log(total_docs / host_count),
                doc.name,
            )
            for doc, stems in hosts
        ]

    def tf_idf(self, word):
        """Return ``(term_frequency, doc_name)`` for each document with *word*.

        Despite the name, only the term frequency is computed: the number of
        exact occurrences of *word* divided by the number of stems, which is
        the same ratio ``nltk.TextCollection.tf`` returns. Each document
        yields exactly one entry, even when several documents have identical
        stems.
        """
        return [
            (stems.count(word) / len(stems), doc.name)
            for doc, stems in self._hosts(word)
        ]

    def most_relevant(self, word):
        """Return the name of the document with the highest term frequency.

        On ties the first document in collection order wins. Returns ``None``
        when no document contains *word*.
        """
        scores = self.tf_idf(word)
        if not scores:
            return None
        # max() keeps the first maximal element, matching a strict ">" scan.
        return max(scores, key=lambda score: score[0])[1]