-
Notifications
You must be signed in to change notification settings - Fork 1
/
vector_model.py
46 lines (41 loc) · 1.29 KB
/
vector_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
from collections import Counter
import np
def df(processed_text):
    """Compute the document frequency of every token.

    Args:
        processed_text: Iterable of documents, where each document is an
            iterable of hashable tokens.

    Returns:
        A dict mapping each token to the number of distinct documents
        (identified by their position in ``processed_text``) in which the
        token occurs at least once.
    """
    # token -> set of positions of the documents that contain it; a set is
    # used so repeated occurrences inside one document count only once.
    containing_docs = {}
    for doc_id, tokens in enumerate(processed_text):
        for token in tokens:
            containing_docs.setdefault(token, set()).add(doc_id)
    return {token: len(doc_ids) for token, doc_ids in containing_docs.items()}
def _tf_idf_entries(processed_text, df_map):
    """Yield ``(doc_id, token, weight)`` for every distinct token of every document."""
    n_docs = len(processed_text)
    for doc_id, tokens in enumerate(processed_text):
        counts = Counter(tokens)
        # np.unique gives the distinct tokens in sorted order; compute it once
        # per document and reuse it for both the normaliser and the loop.
        unique_tokens = np.unique(tokens)
        unique_count = len(unique_tokens)
        for token in unique_tokens:
            # Term frequency is normalised by the number of DISTINCT tokens
            # in the document, not by the document's total token count.
            term_freq = counts[token] / unique_count
            # +1 in the denominator smooths the inverse document frequency.
            inv_doc_freq = np.log(n_docs / (df_map[token] + 1))
            yield doc_id, token, term_freq * inv_doc_freq


def tf_idf(processed_text, df_map,
           output_path="C:\\Users\\HP\\Desktop\\ir\\Lab4\\test.txt"):
    """Compute TF-IDF weights for every (document, token) pair.

    For document ``i`` and token ``t`` the weight is::

        (count of t in doc i / number of distinct tokens in doc i)
            * ln(N / (df_map[t] + 1))

    where ``N`` is ``len(processed_text)``.

    Args:
        processed_text: Sequence of documents, each a sequence of tokens.
        df_map: Mapping from token to its document frequency. Every token
            that occurs in ``processed_text`` must be present.
        output_path: File that receives a text dump of the weights, one
            ``<doc_id>,<token>--><weight>`` line per entry. The default is
            the historical hard-coded location, kept so existing callers
            behave as before. Pass ``None`` to skip writing the dump.

    Returns:
        A dict keyed by ``(doc_id, token)`` whose values are the weights.

    Raises:
        KeyError: If a token is missing from ``df_map``.
        OSError: If ``output_path`` cannot be opened for writing.
    """
    entries = _tf_idf_entries(processed_text, df_map)
    if output_path is None:
        return {(doc_id, token): weight for doc_id, token, weight in entries}

    weights = {}
    # Explicit UTF-8 so non-ASCII tokens do not depend on the platform's
    # default encoding.
    with open(output_path, "w", encoding="utf-8") as fw:
        for doc_id, token, weight in entries:
            weights[doc_id, token] = weight
            fw.write(f"{doc_id},{token}-->{weight!s}\n")
    return weights
# Document Vectorization
def build_docs_vectors(n, total_vocab, tf_idf):
    """Build a dense document-term matrix from sparse TF-IDF weights.

    Args:
        n: Number of rows (documents) in the resulting matrix.
        total_vocab: Sequence of hashable vocabulary terms; a term's position
            in this sequence is its column in the matrix. If a term appears
            more than once, its first position is used.
        tf_idf: Mapping whose keys are ``(doc_id, term)`` pairs and whose
            values are the weights to store.

    Returns:
        A NumPy array of shape ``(n, len(total_vocab))`` that is zero
        everywhere except at ``[doc_id, column_of(term)]`` for each entry
        of ``tf_idf``.

    Raises:
        ValueError: If a term in ``tf_idf`` does not occur in ``total_vocab``.
    """
    # Resolve term -> column once instead of scanning the vocabulary for
    # every entry; setdefault keeps the first occurrence of a term.
    column_of = {}
    for col, term in enumerate(total_vocab):
        column_of.setdefault(term, col)

    docs_vectors = np.zeros((n, len(total_vocab)))
    for key, weight in tf_idf.items():
        try:
            col = column_of[key[1]]
        except KeyError:
            # Keep the exception type callers saw from list.index().
            raise ValueError(f"{key[1]!r} is not in total_vocab") from None
        docs_vectors[key[0], col] = weight
    return docs_vectors