Commit 51260f5

Skip Writing Zero Frequency Pages on CSV
Add logger file for search script.
1 parent 2589350 commit 51260f5

2 files changed: 19 additions (+), 2 deletions (−)
Search-Engine-and-Crawler/Crawler/README.md

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ machine.
 
 #### Sample usage
 ```
-python crawlerExpand.py [URL] 10 50 lefolder
+python crawlerExpand.py [URL] 10 50 myuni
 ```
 
 ### search.py

Search-Engine-and-Crawler/Crawler/search.py

Lines changed: 18 additions & 1 deletion
@@ -6,13 +6,19 @@
 import logging
 import os
 import glob
+import time
 from collections import Counter
 import pandas as pd
 from tqdm import tqdm
 
 
 logger = logging.getLogger(__name__)
 
+# current time, used in the names of the folder and the logging file
+curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
+# Create a new log file
+logging.basicConfig(filename=('_unisearchlog_' + curtime + '.log'), level=logging.DEBUG)
+
 
 def main():
 
@@ -63,43 +69,54 @@ def main():
         # Using dictionary keys as fieldnames for the CSV file header
         writer = csv.DictWriter(f, headers)
         writer.writeheader()
+        logger.info("CSV headers written")
 
         for idx, txt_file in enumerate(all_txt_files):
             with open(txt_file) as fp:
                 visible_text_list = fp.readlines()
             current_url = visible_text_list[0].strip().rstrip()
             num_digits = len(str(len(all_txt_files)))
-            tqdm.write("{0:0{width}d}) Done! {1}".
+            tqdm.write("[{0:0{width}d}] {1}".
                        format(idx+1, current_url, width=num_digits))
 
+            logger.info("Working on: {}".format(current_url))
             visible_text_list = [x.lower() for x in visible_text_list]
 
             # counts keywords in page
             found_count, found_keywords = count_keywords(
                 visible_text_list,
                 keywords
             )
+            logger.info("Keywords found: {}".format(found_count))
             found_keywords_as_dict = dict((x, y) for x, y in found_keywords)
 
             found_keywords_freq_dict = Counter(found_keywords_as_dict)
 
             all_keywords_dict = Counter(all_keywords)
             # combine both dicts to have uniform dictionary for all pages
             all_keywords_dict.update(found_keywords_freq_dict)
+            logger.info("Keywords search results merged!")
             # after merging, sort the resulting dictionary based on keys to
             # make a tuples list that is always uniform for every page
             sorted_keywords_list = sorted(all_keywords_dict.items())
 
             # create a sorted dictionary list
             final_csv_dict = []
             final_csv_dict.append({x: y for x, y in sorted_keywords_list})
+            logger.info("Final dictionary appended!")
 
             # prepend the current URL onto the frequencies dict object
             final_csv_dict[0]['frequency_sum'] = sum(final_csv_dict[0].values())
             final_csv_dict[0]['url'] = current_url
 
+            # ignore zero frequency_sum...
+            if final_csv_dict[0]['frequency_sum'] == 0:
+                pbar.update(1)
+                continue
+
             for d in final_csv_dict:
                 writer.writerow(d)
+            logger.info("Row written successfully!")
 
             pbar.update(1)
 
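With the new guard in place, a page whose merged keyword counts sum to zero is skipped before writer.writerow, so it never produces a CSV row. Below is a minimal sketch of that merge-and-skip logic, assuming all_keywords is a zero-initialized keyword-to-count mapping; the keyword names and URL are hypothetical stand-ins.

```
from collections import Counter

# Hypothetical inputs; the real script builds these from its keyword list and
# from count_keywords() over a page's visible text.
all_keywords = {"admissions": 0, "tuition": 0}  # uniform zero baseline (assumed shape)
found_keywords = []                             # nothing matched on this page

# Merge the per-page counts into the uniform baseline, as search.py does.
all_keywords_dict = Counter(all_keywords)
all_keywords_dict.update(Counter(dict(found_keywords)))

# Build the sorted row and total it before adding the non-numeric url field.
row = dict(sorted(all_keywords_dict.items()))
row["frequency_sum"] = sum(row.values())
row["url"] = "https://example.edu/empty-page"

# The commit's skip: zero-frequency rows are never written to the CSV.
if row["frequency_sum"] == 0:
    print("skipped:", row["url"])
```

Note that in the actual loop, pbar.update(1) runs before the continue, so the progress bar still advances for skipped pages.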
