|
6 | 6 | import logging |
7 | 7 | import os |
8 | 8 | import glob |
| 9 | +import time |
9 | 10 | from collections import Counter |
10 | 11 | import pandas as pd |
11 | 12 | from tqdm import tqdm |
12 | 13 |
|
13 | 14 |
|
14 | 15 | logger = logging.getLogger(__name__) |
15 | 16 |
|
| 17 | +# current UTC time, used in the names of the folder and the log file |
| 18 | +curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime()) |
| 19 | +# Create a new log file |
| 20 | +logging.basicConfig(filename='_unisearchlog_' + curtime + '.log', level=logging.DEBUG) |
| 21 | + |
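The run-stamped log name comes from formatting the current UTC time; a minimal standalone sketch of the same pattern (the example timestamp in the comment is illustrative):

```python
import logging
import time

# UTC timestamp, e.g. "2024-01-31-09-15-42"
curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

# One fresh log file per run; DEBUG level keeps every logger.info() call
logging.basicConfig(filename='_unisearchlog_' + curtime + '.log',
                    level=logging.DEBUG)
```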
16 | 22 |
|
17 | 23 | def main(): |
18 | 24 |
|
@@ -63,43 +69,54 @@ def main(): |
63 | 69 | # Using dictionary keys as fieldnames for the CSV file header |
64 | 70 | writer = csv.DictWriter(f, headers) |
65 | 71 | writer.writeheader() |
| 72 | + logger.info("CSV headers written") |
66 | 73 |
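As context for the header step, `csv.DictWriter` takes the fieldnames up front and matches each row dict to those columns by key; a minimal sketch with illustrative headers and filename:

```python
import csv

headers = ['url', 'frequency_sum', 'pandas', 'python']  # illustrative fieldnames
with open('results.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, headers)
    writer.writeheader()  # writes the column names as the first row
    writer.writerow({'url': 'https://example.com',
                     'frequency_sum': 3,
                     'pandas': 2,
                     'python': 1})
```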
|
67 | 74 | for idx, txt_file in enumerate(all_txt_files): |
68 | 75 | with open(txt_file) as fp: |
69 | 76 | visible_text_list = fp.readlines() |
70 | 77 | current_url = visible_text_list[0].strip() |
71 | 78 | num_digits = len(str(len(all_txt_files))) |
72 | | - tqdm.write("{0:0{width}d}) Done! {1}". |
| 79 | + tqdm.write("[{0:0{width}d}] {1}". |
73 | 80 | format(idx+1, current_url, width=num_digits)) |
74 | 81 |
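The reworked progress line pads the index with zeros to the width of the total file count, using a nested format spec; a quick sketch:

```python
all_txt_files = ['page.txt'] * 120          # illustrative: 120 files
num_digits = len(str(len(all_txt_files)))   # -> 3
line = "[{0:0{width}d}] {1}".format(7, "https://example.com", width=num_digits)
print(line)                                 # -> [007] https://example.com
```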
|
| 82 | + logger.info("Working on: {}".format(current_url)) |
75 | 83 | visible_text_list = [x.lower() for x in visible_text_list] |
76 | 84 |
|
77 | 85 | # counts keywords in page |
78 | 86 | found_count, found_keywords = count_keywords( |
79 | 87 | visible_text_list, |
80 | 88 | keywords |
81 | 89 | ) |
| 90 | + logger.info("Keywords found: {}".format(found_count)) |
82 | 91 | found_keywords_as_dict = dict(found_keywords) |
83 | 92 |
|
84 | 93 | found_keywords_freq_dict = Counter(found_keywords_as_dict) |
85 | 94 |
|
86 | 95 | all_keywords_dict = Counter(all_keywords) |
87 | 96 | # combine both dicts to have uniform dictionary for all pages |
88 | 97 | all_keywords_dict.update(found_keywords_freq_dict) |
| 98 | + logger.info("Keywords search results merged!") |
89 | 99 | # after merging, sort the resulting dictionary based on keys to |
90 | 100 | # make a tuples list that is always uniform for every page |
91 | 101 | sorted_keywords_list = sorted(all_keywords_dict.items()) |
92 | 102 |
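The merge relies on `Counter.update` adding counts instead of replacing them (unlike `dict.update`), so after sorting, every page yields the same set of keys in the same order; a minimal sketch, assuming `all_keywords` maps each keyword to a zero baseline:

```python
from collections import Counter

all_keywords = {'csv': 0, 'pandas': 0, 'python': 0}  # assumed zero baseline
found_keywords_freq_dict = Counter({'pandas': 2, 'csv': 1})

all_keywords_dict = Counter(all_keywords)
all_keywords_dict.update(found_keywords_freq_dict)   # adds counts, keeps misses at 0
print(sorted(all_keywords_dict.items()))
# -> [('csv', 1), ('pandas', 2), ('python', 0)]
```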
|
93 | 103 | # create a sorted dictionary list |
94 | 104 | final_csv_dict = [] |
95 | 105 | final_csv_dict.append({x: y for x, y in sorted_keywords_list}) |
| 106 | + logger.info("Final dictionary appended!") |
96 | 107 |
|
97 | 108 | # prepend the current URL onto the frequencies dict object |
98 | 109 | final_csv_dict[0]['frequency_sum'] = sum(final_csv_dict[0].values()) |
99 | 110 | final_csv_dict[0]['url'] = current_url |
100 | 111 |
|
| 112 | + # skip pages with a zero frequency_sum (no keywords found) |
| 113 | + if final_csv_dict[0]['frequency_sum'] == 0: |
| 114 | + pbar.update(1) |
| 115 | + continue |
| 116 | + |
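The new guard advances the progress bar before `continue`, so skipped pages still count toward the total; the same pattern in isolation:

```python
from tqdm import tqdm

frequency_sums = [0, 3, 0, 5]            # illustrative per-page sums
with tqdm(total=len(frequency_sums)) as pbar:
    for freq in frequency_sums:
        if freq == 0:
            pbar.update(1)               # keep the bar accurate for skipped pages
            continue
        # the CSV row would be written here
        pbar.update(1)
```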
101 | 117 | for d in final_csv_dict: |
102 | 118 | writer.writerow(d) |
| 119 | + logger.info("Row written successfully!") |
103 | 120 |
|
104 | 121 | pbar.update(1) |
105 | 122 |
|
|