#██████████████████████████████████████████████████████████#
#█ █#
#█ ██████ ██████ █████████ █████████ ██████████ █#
#█░░██████ ██████ ███░░░░░███ ███░░░░░███░░███░░░░░█ █#
#█ ░███░█████░███ ░███ ░███ ███ ░░░ ░███ █ ░ █#
#█ ░███░░███ ░███ ░███████████ ░███ ░██████ █#
#█ ░███ ░░░ ░███ ░███░░░░░███ ░███ ░███░░█ █#
#█ ░███ ░███ ░███ ░███ ░░███ ███ ░███ ░ █ █#
#█ █████ █████ █████ █████ ░░█████████ ██████████ █#
#█ ░░░░░ ░░░░░ ░░░░░ ░░░░░ ░░░░░░░░░ ░░░░░░░░░░ █#
#█ █#
#█ - - - - - - - - SIMPLE SEARCH ENGINE - - - - - - - - - █#
#██████████████████████████████████████████████████████████#
# #
# Developed by: Allen Cedric Domingo #
# https://github.com/cedricdomingo #
# #
# Last Updated: Nov 30, 2022 #
# #
# =========================================================#
# A project created for Professor Bojiang Ma #
# Kwantlen Polytechnic University, Fall 2022 Semester #
# INFO 4105: Search Engine Principles (S10) #
# =========================================================#

import nltk  # Used for the tokenizer (word_tokenize) and stemmer (PorterStemmer) later on
import time  # Used to calculate indexing and search times
from math import log, sqrt  # Used for calculating TF-IDF later on
from collections import defaultdict  # Avoids creating a new empty dictionary entry every time a new word appears
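# NOTE (environment assumption): nltk.word_tokenize relies on the "punkt" tokenizer
# models; if they are not installed yet, run nltk.download("punkt") once beforehand.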

total_document_count = 14577  # Number of files located in the Files/ folder
inv_index = defaultdict(list)  # Returns an empty list whenever a non-existent key is accessed
all_document_vectors = []  # One dictionary per document; each is that document's term vector
document_frequency = {}  # Maps each token to the number of documents it appears in
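# Pipeline overview: read_all_documents() builds a term-frequency vector per file,
# inv_index_all_documents() records which documents contain each term, and
# tf_idf_documents() rescales every vector to unit-length TF-IDF weights so that
# queries can be ranked by cosine similarity (a plain dot product) in query_result().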

# Prints our banner at execution
def print_banner():
print("███████████████████████████████████████████████████████████\n")
print(" ██████ ██████ █████████ █████████ ██████████")
print(" ░░██████ ██████ ███░░░░░███ ███░░░░░███░░███░░░░░█")
print(" ░███░█████░███ ░███ ░███ ███ ░░░ ░███ █ ░ ")
print(" ░███░░███ ░███ ░███████████ ░███ ░██████ ")
print(" ░███ ░░░ ░███ ░███░░░░░███ ░███ ░███░░█ ")
print(" ░███ ░███ ░███ ░███ ░░███ ███ ░███ ░ █")
print(" █████ █████ █████ █████ ░░█████████ ██████████")
print(" ░░░░░ ░░░░░ ░░░░░ ░░░░░ ░░░░░░░░░ ░░░░░░░░░░ ")
print("\n - - - - - - - - SIMPLE SEARCH ENGINE - - - - - - - - - -")
print("\n███████████████████████████████████████████████████████████")
print("\n Developed by: Allen Cedric Domingo ")
print(" https://github.com/cedricdomingo ")
print("\n Last Updated: Nov 30, 2022 ")
print("\n========================================================")
print(" A project created for Professor Bojiang Ma")
print(" Kwantlen Polytechnic University, Fall 2022 Semester")
print(" INFO 4105: Search Engine Principles (S10)")
print("========================================================")

# Adds a token-frequency dict to all_document_vectors for every document.
def read_all_documents():
    for document_id in range(total_document_count):  # range() already excludes the endpoint, covering IDs 0 .. total_document_count - 1
        document_text = document_string(document_id)
        token_list = stem_and_tokenize(document_text)
        token_frequency_vector = create_vector(token_list)
        all_document_vectors.append(token_frequency_vector)

# Creates a token-frequency dictionary from the input query.
def input_vector(query):
    token_frequency_vector = {}
    for word in query:
        if word in token_frequency_vector:
            token_frequency_vector[word] += 1.0  # Floats, since they are converted to TF-IDF scores later - ACD
        else:
            token_frequency_vector[word] = 1.0
    return token_frequency_vector
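# This mirrors create_vector() below, minus the document-frequency bookkeeping:
# the query is not a document, so it must not change the collection statistics.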

# Generates the inverted index for all documents.
def inv_index_all_documents():
    for document_id, document_vector in enumerate(all_document_vectors):
        for word in document_vector:
            inv_index[word].append(document_id)  # defaultdict supplies an empty list on first access
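# inv_index now maps each stemmed token to the IDs of the documents that contain
# it. Note that query_result() below scores every document directly, so the index
# is built for reference rather than consulted during ranking.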

# Converts every token-frequency vector into a unit-length TF-IDF vector.
def tf_idf_documents():
    for document_vector in all_document_vectors:
        length = 0.0  # Reset per document so each vector is normalized on its own
        for word in document_vector:
            frequency = document_vector[word]
            score = tf_idf_weighting(word, frequency)
            document_vector[word] = score
            length += score ** 2
        length = sqrt(length)
        if length != 0:  # Guard against empty documents
            for word in document_vector:
                document_vector[word] /= length

# Converts the query's token-frequency vector into a unit-length TF-IDF vector.
def tf_idf_query(query_vector):
    length = 0.0
    for word in query_vector:
        frequency = query_vector[word]
        if word in document_frequency:
            query_vector[word] = tf_idf_weighting(word, frequency)
        else:
            # Unseen word: treat its document frequency as 1, so its IDF is log(N)
            query_vector[word] = log(1 + frequency) * log(total_document_count)
        length += query_vector[word] ** 2
    length = sqrt(length)
    if length != 0:
        for word in query_vector:
            query_vector[word] /= length

# Calculates the TF-IDF weight, given a term and its raw frequency in one document.
def tf_idf_weighting(word, frequency):
    return log(1 + frequency) * log(total_document_count / document_frequency[word])
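# Worked example (illustrative numbers, not taken from the dataset): a term with
# tf = 3 in one document and df = 100 across the N = 14577 documents weighs in at
# log(1 + 3) * log(14577 / 100) ≈ 1.386 * 4.982 ≈ 6.91 before normalization.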

# Calculates the dot product of two given vectors.
def dot_product(vector_a, vector_b):
    if len(vector_a) > len(vector_b):  # Swap so we always iterate over the smaller dict
        vector_a, vector_b = vector_b, vector_a
    total = 0.0
    for key in vector_a:
        if key in vector_b:
            total += vector_a[key] * vector_b[key]
    return total
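# Because tf_idf_documents() and tf_idf_query() normalize their vectors to unit
# length, this dot product is exactly the cosine similarity used for ranking.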

# Returns a list of string tokens after stemming.
def stem_and_tokenize(document_text):
    token_list = nltk.word_tokenize(document_text)  # First split the text into tokens with nltk.word_tokenize
    ps = nltk.stem.PorterStemmer()  # Then reduce each token to its stem with nltk.stem.PorterStemmer
    result = []
    for word in token_list:
        result.append(ps.stem(word))
    return result
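# For example, "Searching engines" should come back roughly as ['search', 'engin']
# (the Porter stemmer lowercases tokens and strips common suffixes).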

# Creates a token-frequency vector from a token list; also updates document frequencies.
def create_vector(token_list):
    token_frequency_vector = {}
    for token in token_list:
        if token in token_frequency_vector:
            token_frequency_vector[token] += 1
        else:
            token_frequency_vector[token] = 1
            if token in document_frequency:
                document_frequency[token] += 1
            else:
                document_frequency[token] = 1
    return token_frequency_vector
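# Note the placement above: document_frequency is bumped only on a token's first
# occurrence in a document, so it counts documents containing the term, not raw hits.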

# Reads one document's text from the Files/ folder (files are named by document ID).
def document_string(document_id):
    try:
        with open("Files/" + str(document_id)) as file:  # e.g. Files/0, Files/1, ...
            file_text = file.read()
    except OSError:
        file_text = ""  # Treat missing or unreadable files as empty documents - ACD
    return file_text

# Returns a list of (document ID, score) pairs sorted by cosine similarity.
def query_result(query_vector):
    answer = []
    for document_id in range(total_document_count):
        dp = dot_product(query_vector, all_document_vectors[document_id])
        answer.append((document_id, dp))
    answer = sorted(answer, key=lambda x: x[1], reverse=True)
    return answer

# Execution starts from here.
start_index_time = time.time()
print_banner()
print("\n\n Currently indexing... Please be patient.")
read_all_documents()
inv_index_all_documents()
tf_idf_documents()
print("\n\n\n\n--------------------------------------------------------")
print("Indexing complete! Thank you for waiting.")
print("Indexing took " + str(
time.time() - start_index_time) + " seconds.") # Subtracts current time from time execution begins (stored in
# variable start_index_time) - ACD
print("--------------------------------------------------------")

# Preprocessing ends here.
# Query input is taken in the loop below.
while True:
rank = 0
query = input("Enter a search query (Enter 'x' to exit): ")
if query == "x": # Added if statement to show credits and close search engine if 'x' is entered
print("\n\n\n========================================================")
print_banner()
print(" Closing in 10 seconds...")
time.sleep(10) # Added 10s delay before closing program so there is time to view our credits
break
    if len(query) == 0:
        # Re-prompt the user when no text is entered
        print("No text found, please enter some text to search.")
else:
start_search_time = time.time()
token_list = stem_and_tokenize(query)
query_vector = input_vector(token_list)
tf_idf_query(query_vector)
result = query_result(query_vector)
print("This query took " + str(
time.time() - start_search_time) + " seconds.") # Added print statement to show query time
print("====================================================================")
print(" Viewing results for query: '" + query + "'")
print("====================================================================")
        f_result = result[:10]  # The slice limit caps how many results are returned; change 10 to show more or fewer
        for element in f_result:
            if float(element[1]) == 0.0:  # A zero tf-idf score means no match, so stop printing here
                if rank == 0:
                    print("No results found, please try a different query.")
                break
            rank = rank + 1
            print("Rank:\t" + str(rank) + "\t| Doc ID: " + str(element[0]) + "\t| Weight: " + str(element[1]))
        print("====================================================================")
        # Added option to expand the results and read the matching documents
        viewResults = input("View results? \n(Enter 'y' to view, otherwise press Enter to search again): ")
        if viewResults == "y":
            rank = 0
            for element in f_result:
                if float(element[1]) == 0.0:  # Stop printing at the first zero tf-idf score (no match)
                    break
                rank = rank + 1
                print("Rank:\t" + str(rank) + "\t| Doc ID: " + str(element[0]) + "\t| Weight: " + str(element[1]))
                print("=======================================================================================")
                print(" Query: '" + query + "' \t| Rank: " + str(rank) + "\t| Doc ID: " + str(element[0]) + "\t| Weight: " + str(element[1]))
                print("---------------------------------------------------------------------------------------")
                print(document_string(element[0]))  # Reuse document_string() so unreadable files print as empty text
                print("=======================================================================================")