
Commit 2589350

Reduce CSV Writer Overhead for Faster Processing
Now opens the CSV file once for appending instead of reopening it on every iteration as before. The header is also written once, before iteration starts, removing the check that previously ran every time.
1 parent 29bdc2b commit 2589350
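
In outline, the change replaces a per-page write_csv() call (which reopened the output file and re-checked for a header each time) with a single open file handle and one csv.DictWriter shared by the whole loop. A minimal sketch of that pattern, with illustrative headers and rows standing in for the ones search.py builds:

import csv

# Illustrative stand-ins for the headers list and per-page row dicts built in search.py.
headers = ['url', 'frequency_sum', 'game ai', 'video game']
rows = [
    {'url': 'https://example.com/a', 'frequency_sum': 2, 'game ai': 0, 'video game': 2},
    {'url': 'https://example.com/b', 'frequency_sum': 1, 'game ai': 1, 'video game': 0},
]

# Open the output once, write the header once, then reuse the same writer for every row,
# instead of reopening the file and re-checking for the header on each iteration.
with open('results.csv', 'a+', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, headers)
    writer.writeheader()
    for row in rows:
        writer.writerow(row)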

2 files changed (+54, -69 lines)


Search-Engine-and-Crawler/Crawler/keywords_game.txt

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ gaming
 game art
 game AI
 video game
+video games|20
 computer games
 console games
 mobile games
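
The new entry carries a trailing "|20", the keyword|weight form that search.py passes through strip_weights() before seeding its zero-frequency dictionary. The helper's implementation is not part of this diff; a hypothetical sketch, assuming the pipe-separated suffix is an integer weight, could look like:

def strip_weights(entry, default_weight=1):
    # Assumed behaviour only: "video games|20" -> ("video games", 20); no pipe -> default weight.
    keyword, sep, weight = entry.partition('|')
    return keyword.strip(), int(weight) if sep else default_weight

search.py only uses element [0] of the result, the bare keyword, when building all_keywords.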

Search-Engine-and-Crawler/Crawler/search.py

Lines changed: 53 additions & 69 deletions
@@ -45,53 +45,63 @@ def main():
     keywords = [x.lower() for x in keywords]
     # make keywords dictionary with zero frequency as value
     all_keywords = dict((strip_weights(el)[0], 0) for el in keywords)
+    all_keywords_dict = Counter(all_keywords)
+
+    sorted_keywords_list = sorted(all_keywords_dict.items())
+
+    # extract a sorted list of keywords to write as CSV headers
+    headers = [str(x) for x, y in sorted_keywords_list]
+    # prepend url header onto the keywords list
+    headers.insert(0, u'url')
+    headers.insert(1, u'frequency_sum')
 
     pbar = tqdm(total=len(all_txt_files))
     tqdm.write("Found {} files to search. Please wait.".
                format(len(all_txt_files)))
-    for idx, txt_file in enumerate(all_txt_files):
-        with open(txt_file) as fp:
-            visible_text_list = fp.readlines()
-            current_url = visible_text_list[0].strip().rstrip()
-            num_digits = len(str(len(all_txt_files)))
-            tqdm.write("{0:0{width}d}) Done! {1}".
-                       format(idx+1, current_url, width=num_digits))
-
-            visible_text_list = [x.lower() for x in visible_text_list]
-
-            # counts keywords in page
-            found_count, found_keywords = count_keywords(
-                visible_text_list,
-                keywords
-            )
-            found_keywords_as_dict = dict((x, y) for x, y in found_keywords)
-
-            found_keywords_freq_dict = Counter(found_keywords_as_dict)
-
-            all_keywords_dict = Counter(all_keywords)
-            # combine both dicts to have uniform dictionary for all pages
-            all_keywords_dict.update(found_keywords_freq_dict)
-            # after merging, sort the resulting dictionary based on keys to
-            # make a tuples list that is always uniform for every page
-            sorted_keywords_list = sorted(all_keywords_dict.items())
-
-            # create a sorted dictionary list
-            final_csv_dict = []
-            final_csv_dict.append({x: y for x, y in sorted_keywords_list})
-
-            # extract a sorted list of keywords to write as CSV headers
-            headers = [str(x) for x, y in sorted_keywords_list]
-            # prepend url header onto the keywords list
-            headers.insert(0, u'url')
-            headers.insert(1, u'frequency_sum')
-            # logger.info(headers)
-
-            # prepend the current URL onto the frequencies dict object
-            final_csv_dict[0]['frequency_sum'] = sum(final_csv_dict[0].values())
-            final_csv_dict[0]['url'] = current_url
-
-            write_csv(csv_file_name, headers, final_csv_dict)
-            pbar.update(1)
+
+    with open(csv_file_name, 'a+', encoding="utf-8") as f:
+        # Using dictionary keys as fieldnames for the CSV file header
+        writer = csv.DictWriter(f, headers)
+        writer.writeheader()
+
+        for idx, txt_file in enumerate(all_txt_files):
+            with open(txt_file) as fp:
+                visible_text_list = fp.readlines()
+                current_url = visible_text_list[0].strip().rstrip()
+                num_digits = len(str(len(all_txt_files)))
+                tqdm.write("{0:0{width}d}) Done! {1}".
+                           format(idx+1, current_url, width=num_digits))
+
+                visible_text_list = [x.lower() for x in visible_text_list]
+
+                # counts keywords in page
+                found_count, found_keywords = count_keywords(
+                    visible_text_list,
+                    keywords
+                )
+                found_keywords_as_dict = dict((x, y) for x, y in found_keywords)
+
+                found_keywords_freq_dict = Counter(found_keywords_as_dict)
+
+                all_keywords_dict = Counter(all_keywords)
+                # combine both dicts to have uniform dictionary for all pages
+                all_keywords_dict.update(found_keywords_freq_dict)
+                # after merging, sort the resulting dictionary based on keys to
+                # make a tuples list that is always uniform for every page
+                sorted_keywords_list = sorted(all_keywords_dict.items())
+
+                # create a sorted dictionary list
+                final_csv_dict = []
+                final_csv_dict.append({x: y for x, y in sorted_keywords_list})
+
+                # prepend the current URL onto the frequencies dict object
+                final_csv_dict[0]['frequency_sum'] = sum(final_csv_dict[0].values())
+                final_csv_dict[0]['url'] = current_url
+
+                for d in final_csv_dict:
+                    writer.writerow(d)
+
+                pbar.update(1)
 
     pbar.close()
     sort_csv(csv_file_name, sorted_csv_file_name)
@@ -153,32 +163,6 @@ def count_keywords(list_of_tokens, list_of_target_words):
     return num_target_words, matched_words  # Note that we are returning a tuple (2 values)
 
 
-def write_csv(output_file, keywords_header, keywords_x_freqs):
-    """Write a CSV file in the format url, <keyword1>, <keyword2>, <keyword3>, ...
-    output_file - the name of created CSV file
-    keywords_header - list with all the keywords to create header row of CSV
-    keywords_x_freqs - dictionary list with keywords and frequencies
-    return boolean
-    """
-    try:
-        if os.path.exists(output_file):
-            append_write = 'a'  # append if already exists
-        else:
-            append_write = 'w'  # make a new file if not
-
-        with open(output_file, append_write, encoding="utf-8") as f:
-            # Using dictionary keys as fieldnames for the CSV file header
-            writer = csv.DictWriter(f, keywords_header)
-            if append_write == 'w':
-                writer.writeheader()
-
-            for d in keywords_x_freqs:
-                writer.writerow(d)
-        return True
-    except Exception as e:
-        logger.error('Something bad happend while writing CSV:' + str(e))
-        return False
-
 if __name__ == "__main__":
 
     parser = argparse.ArgumentParser(
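
The row construction inside the loop relies on collections.Counter semantics: a Counter seeded with every keyword at zero frequency is update()-d with the counts found on one page, which adds values key by key, so every row ends up with the same complete set of columns that the csv.DictWriter header expects. A small illustration with made-up counts:

from collections import Counter

all_keywords = {'game ai': 0, 'video game': 0, 'video games': 0}  # every keyword starts at zero
found_on_page = Counter({'video game': 2})                        # frequencies counted on one page

row = Counter(all_keywords)
row.update(found_on_page)    # Counter.update adds counts instead of replacing entries
print(sorted(row.items()))   # [('game ai', 0), ('video game', 2), ('video games', 0)]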
