@@ -45,53 +45,68 @@ def main():
     keywords = [x.lower() for x in keywords]
     # make keywords dictionary with zero frequency as value
     all_keywords = dict((strip_weights(el)[0], 0) for el in keywords)
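+    # build the CSV header row once, before the file loop, from the full keyword set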
+    all_keywords_dict = Counter(all_keywords)
+
+    sorted_keywords_list = sorted(all_keywords_dict.items())
+
+    # extract a sorted list of keywords to write as CSV headers
+    headers = [str(x) for x, y in sorted_keywords_list]
+    # prepend the url and frequency_sum headers onto the keywords list
+    headers.insert(0, u'url')
+    headers.insert(1, u'frequency_sum')
 
     pbar = tqdm(total=len(all_txt_files))
     tqdm.write("Found {} files to search. Please wait.".
               format(len(all_txt_files)))
-    for idx, txt_file in enumerate(all_txt_files):
-        with open(txt_file) as fp:
-            visible_text_list = fp.readlines()
-        current_url = visible_text_list[0].strip().rstrip()
-        num_digits = len(str(len(all_txt_files)))
-        tqdm.write("{0:0{width}d}) Done! {1}".
-                   format(idx + 1, current_url, width=num_digits))
-
-        visible_text_list = [x.lower() for x in visible_text_list]
-
-        # counts keywords in page
-        found_count, found_keywords = count_keywords(
-            visible_text_list,
-            keywords
-        )
-        found_keywords_as_dict = dict((x, y) for x, y in found_keywords)
-
-        found_keywords_freq_dict = Counter(found_keywords_as_dict)
-
-        all_keywords_dict = Counter(all_keywords)
-        # combine both dicts to have uniform dictionary for all pages
-        all_keywords_dict.update(found_keywords_freq_dict)
-        # after merging, sort the resulting dictionary based on keys to
-        # make a tuples list that is always uniform for every page
-        sorted_keywords_list = sorted(all_keywords_dict.items())
-
-        # create a sorted dictionary list
-        final_csv_dict = []
-        final_csv_dict.append({x: y for x, y in sorted_keywords_list})
-
-        # extract a sorted list of keywords to write as CSV headers
-        headers = [str(x) for x, y in sorted_keywords_list]
-        # prepend url header onto the keywords list
-        headers.insert(0, u'url')
-        headers.insert(1, u'frequency_sum')
-        # logger.info(headers)
-
-        # prepend the current URL onto the frequencies dict object
-        final_csv_dict[0]['frequency_sum'] = sum(final_csv_dict[0].values())
-        final_csv_dict[0]['url'] = current_url
-
-        write_csv(csv_file_name, headers, final_csv_dict)
-        pbar.update(1)
+
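+    # open the output CSV once and share one DictWriter across all pages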
+    with open(csv_file_name, 'a+', encoding="utf-8", newline='') as f:
+        # use the precomputed headers list as fieldnames for the CSV header row
+        writer = csv.DictWriter(f, headers)
+        writer.writeheader()
+
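+        # one pass per page: each text file becomes one CSV row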
+        for idx, txt_file in enumerate(all_txt_files):
+            with open(txt_file) as fp:
+                visible_text_list = fp.readlines()
+            current_url = visible_text_list[0].strip().rstrip()
+            num_digits = len(str(len(all_txt_files)))
+            tqdm.write("{0:0{width}d}) Done! {1}".
+                       format(idx + 1, current_url, width=num_digits))
+
+            visible_text_list = [x.lower() for x in visible_text_list]
+
+            # counts keywords in page
+            found_count, found_keywords = count_keywords(
+                visible_text_list,
+                keywords
+            )
+            found_keywords_as_dict = dict((x, y) for x, y in found_keywords)
+
+            found_keywords_freq_dict = Counter(found_keywords_as_dict)
+
+            all_keywords_dict = Counter(all_keywords)
+            # combine both dicts to have uniform dictionary for all pages
+            all_keywords_dict.update(found_keywords_freq_dict)
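+            # note: Counter.update adds counts rather than replacing them like dict.update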
+            # after merging, sort the resulting dictionary based on keys to
+            # make a tuples list that is always uniform for every page
+            sorted_keywords_list = sorted(all_keywords_dict.items())
+
+            # create a sorted dictionary list
+            final_csv_dict = []
+            final_csv_dict.append({x: y for x, y in sorted_keywords_list})
+
+            # prepend the current URL onto the frequencies dict object
+            final_csv_dict[0]['frequency_sum'] = sum(final_csv_dict[0].values())
+            final_csv_dict[0]['url'] = current_url
+
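+            # every keyword appears in the row dict, so columns always match the header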
+            for d in final_csv_dict:
+                writer.writerow(d)
+
+            pbar.update(1)
 
     pbar.close()
     sort_csv(csv_file_name, sorted_csv_file_name)
@@ -153,32 +168,6 @@ def count_keywords(list_of_tokens, list_of_target_words):
     return num_target_words, matched_words  # Note that we are returning a tuple (2 values)
 
 
-def write_csv(output_file, keywords_header, keywords_x_freqs):
-    """Write a CSV file in the format url, <keyword1>, <keyword2>, <keyword3>, ...
-    output_file - the name of created CSV file
-    keywords_header - list with all the keywords to create header row of CSV
-    keywords_x_freqs - dictionary list with keywords and frequencies
-    return boolean
-    """
-    try:
-        if os.path.exists(output_file):
-            append_write = 'a'  # append if already exists
-        else:
-            append_write = 'w'  # make a new file if not
-
-        with open(output_file, append_write, encoding="utf-8") as f:
-            # Using dictionary keys as fieldnames for the CSV file header
-            writer = csv.DictWriter(f, keywords_header)
-            if append_write == 'w':
-                writer.writeheader()
-
-            for d in keywords_x_freqs:
-                writer.writerow(d)
-        return True
-    except Exception as e:
-        logger.error('Something bad happened while writing CSV: ' + str(e))
-        return False
-
 if __name__ == "__main__":
 
     parser = argparse.ArgumentParser(