# -----------------------------------------------------------
# This module creates a VSM (vector space model) index from
# the documents in the corpus.
#
# The steps performed are summarized below:
# (1) Read all the documents.
# (2) Store document IDs against their file names.
# (3) Tokenize each document.
# (4) Preprocess the tokens (case-fold, remove stop words, and lemmatize).
# (5) Build a VSM matrix with tf-idf as the weighting scheme.
# (6) Store the vsm_index as a binary file.
#
#
# (C) 2020 Muhammad Bilal Akmal, 17K-3669
# -----------------------------------------------------------
import time

import nltk
import numpy as np

import filing
from preprocessing.token_normalizer import normalize_tokens
from preprocessing.tokenizer import tokenize

# Download the WordNet data (needed for lemmatization) if not already present.
nltk.download('wordnet')


def _build_vsm_index(documents, stop_words):
    '''
    Build a tf-idf weighted VSM matrix of the features extracted from
    `documents`. `stop_words` are not considered as features.
    '''
    ndocs = len(documents)

    # Each row is a document; each column is a term (feature).
    vsm_matrix = np.zeros(shape=(ndocs, 0), dtype=float)

    # Maps each term to its column position in vsm_matrix.
    term_positions = {}

    for doc_id, document in documents.items():
        tokens = tokenize(document)
        terms = normalize_tokens(tokens, stop_words)
        for term in terms:
            if term not in term_positions:
                # The term is a new dimension: append a column of zeros.
                position = len(term_positions)
                term_positions[term] = position
                term_frequencies = np.zeros(shape=(ndocs, 1), dtype=float)
                vsm_matrix = np.append(vsm_matrix, term_frequencies, axis=1)
            else:
                position = term_positions[term]
            # Increment the term's frequency for this document.
            vsm_matrix[doc_id][position] += 1

    # Document frequency of each term: non-zero entries per column.
    document_frequencies = np.count_nonzero(vsm_matrix, axis=0)

    # idf = log(N / df); df >= 1 for every term, so this is well-defined.
    idf_vector = np.log(ndocs / document_frequencies)

    # Log-dampened term frequency: tf = log(1 + raw_tf).
    vsm_matrix = np.log(vsm_matrix + 1)

    # Weight: tf-idf = tf * idf (idf broadcasts across the rows).
    vsm_matrix = np.multiply(vsm_matrix, idf_vector)
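
    # Worked example with assumed toy numbers (illustrative only): with
    # N = 4 documents and a term that appears in 2 of them (df = 2) with
    # a raw count of 3 in one document, idf = log(4/2) ~ 0.693 and
    # tf = log(1 + 3) ~ 1.386, so the pre-normalization weight is ~ 0.961.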

    # Normalize each row (document vector) to a unit vector:
    # |v| = (x^2 + y^2 + ...)^(1/2)
    magnitudes = (vsm_matrix**2).sum(axis=1, keepdims=True) ** 0.5
    vsm_matrix = vsm_matrix / magnitudes

    # Pack positions, matrix, and idf into a single object for storage.
    vsm_index = {
        'term_positions': term_positions,
        'vsm_matrix': vsm_matrix,
        'idf_vector': idf_vector
    }

    return vsm_index
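

# A minimal sketch of query-time scoring (not part of the original
# pipeline; `_score_query` is a hypothetical helper added for
# illustration). It assumes queries are weighted with the same
# log-tf * idf scheme as the documents. Because every matrix row is a
# unit vector, a dot product with the unit-length query vector yields
# the cosine similarity.
def _score_query(query, vsm_index, stop_words):
    '''
    Return one cosine-similarity score per document for `query`.
    '''
    term_positions = vsm_index['term_positions']
    vsm_matrix = vsm_index['vsm_matrix']
    idf_vector = vsm_index['idf_vector']

    # Build a raw term-frequency vector in the same term space.
    query_vector = np.zeros(len(term_positions), dtype=float)
    terms = normalize_tokens(tokenize(query), stop_words)
    for term in terms:
        if term in term_positions:
            query_vector[term_positions[term]] += 1

    # Apply the same tf-idf weighting as the documents.
    query_vector = np.log(query_vector + 1) * idf_vector

    # Normalize; an all-zero query matches nothing.
    magnitude = (query_vector**2).sum() ** 0.5
    if magnitude == 0:
        return np.zeros(vsm_matrix.shape[0])

    # Cosine similarity with every document row.
    return vsm_matrix @ (query_vector / magnitude)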


def generate_index_file():
    '''
    Create the index file from the corpus.
    '''
    # Read all the document files.
    pathname = r'resources\corpus\*.txt'
    doc_ids, documents = filing.read_docs_files(pathname)

    # Store the doc_id -> filename dictionary.
    filename = r'resources\doc_ids'
    filing.store_python_object(doc_ids, filename)

    # Read the stop words.
    filename = r'resources\stopwords.txt'
    stop_words = filing.read_stop_words(filename)

    # Build the VSM index.
    vsm_index = _build_vsm_index(documents, stop_words)

    # Store the VSM index as a binary file.
    filename = r'resources\vsm_index'
    filing.store_python_object(vsm_index, filename)


if __name__ == '__main__':
    start = time.time()
    generate_index_file()
    stop = time.time()
    print('VSM Index stored to disk.')
    print(f'Time: {stop - start:.2f} seconds')
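

# Example usage of the `_score_query` sketch above (illustrative; it
# reuses only helpers that already appear in this module, and the query
# string is made up):
#
#     stop_words = filing.read_stop_words(r'resources\stopwords.txt')
#     _, documents = filing.read_docs_files(r'resources\corpus\*.txt')
#     vsm_index = _build_vsm_index(documents, stop_words)
#     scores = _score_query('information retrieval', vsm_index, stop_words)
#     best_doc_id = int(np.argmax(scores))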