 import re
 import urllib.parse
 import logging
+import logging.handlers
 import os
 import time
 import codecs

 # update_tld_names() https://stackoverflow.com/a/22228140
 logger = logging.getLogger(__name__)
-
-# current time, used in the names of the folder and the logging file
-curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
-# Create a new log file
-logging.basicConfig(filename=('_uniscraperlog_' + curtime + '.log'),
-                    level=logging.DEBUG
-                    )
+logger.setLevel(logging.DEBUG)
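+# file handlers are attached per batch by setup_rotating_log() below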

 # https://github.com/tqdm/tqdm/issues/481
 tqdm.monitor_interval = 0
@@ -67,14 +62,34 @@ def main():
     with ChDir(batch_website):
         start_page = get_start_page()

+    setup_rotating_log(batch_website, seed)
+
     with ChDir(batch_website):
         crawl(seed, pbar[idx], start_page, planned_urls_array, crawled_urls_array, website, max_pages)
     overall_prog.update(1)

+def setup_rotating_log(batch_website, seed):
+    with ChDir(batch_website):
+        # current time, used in the names of the folder and the logging file
+        curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
+        logs_dir = "logs"
+        if not os.path.exists(logs_dir):
+            os.mkdir(logs_dir)
+
+        log_file_name = '_uniscraperlog_{}_{}.log'.format(seed, curtime)
+        path_to_log_file = os.path.join(logs_dir, log_file_name)
+        # add a rotating logfile handler
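+        # (rollover renames full logs to .log.1, .log.2, ...; with maxBytes of
+        # 2 MB and backupCount=100, one batch keeps at most ~200 MB of logs)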
+        handler = logging.handlers.RotatingFileHandler(
+            path_to_log_file,
+            maxBytes=2097152,  # 2 MB
+            backupCount=100
+        )
+        logger.addHandler(handler)
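+        # NB: no Formatter is set, so records are written with logging's default
+        # "%(message)s" layout; call handler.setFormatter(logging.Formatter(...))
+        # here if timestamps and levels are wanted in the log file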
+
 def crawl(seed, prog_upd, start_page, planned_urls_array, crawled_urls_array, website, max_pages):
     """Function that takes a link and saves the contents to a text file via href_split
     """
-    logging.info("Crawling through domain '" + seed + "'")
+    logger.info("Crawling through domain '" + seed + "'")
     tqdm.write("++++++++++Crawling through domain {}+++++++++++".format(seed))
     visited_urls, planned_urls, crawled_urls = setup_crawler_files()

@@ -166,20 +181,20 @@ def process_current_link(page, prog_upd, link, seed, visited_urls, crawled_urls_
                 # Close the pipe to the file
                 fo.close()
                 # Log the creation of the file
-                logging.info('Created file ' + name)
+                logger.info('Created file ' + name)

             except KeyboardInterrupt:
                 tqdm.write("Script interrupted by user. Shutting down.")
-                logging.info("Script interrupted by user")
+                logger.info("Script interrupted by user")
                 shut_down()
             except Exception:
-                logging.exception("Can not encode file: " + current_url)
+                logger.exception("Cannot encode file: " + current_url)
         else:
             tqdm.write("No visible text in {}".format(link))
-            logging.warning('No visible text in ' + link)
+            logger.warning('No visible text in ' + link)
     # Else: html does not exist or is empty. Log error
     else:
-        logging.warning('Request for ' + link + ' returned empty html')
+        logger.warning('Request for ' + link + ' returned empty html')
         empty_request_log.write(link)
         empty_request_log.write("\n")

@@ -194,7 +209,7 @@ def process_current_link(page, prog_upd, link, seed, visited_urls, crawled_urls_
     if page % 50 == 0:
         size_of_directory = get_tree_size(os.curdir) / 1000000
         tqdm.write("Size: {} MB".format(str(round(size_of_directory, 5))))
-        logging.info("Size: " + str(round(size_of_directory, 5)) + "MB")
+        logger.info("Size: " + str(round(size_of_directory, 5)) + " MB")
     # Time delay in seconds to prevent crashing the server
     time.sleep(.01)
     return page
@@ -261,7 +276,7 @@ def process_links_from_html(html, prog_upd, cur_link, seed, crawled_urls_array,
             crawled_urls.write("https://" + http_split[1])
             crawled_urls.write("\n")
         except IndexError as e:
-            logging.info(str(e))
+            logger.info(str(e))

     return

@@ -288,7 +303,7 @@ def add_to_crawled_urls_list(new_link, crawled_urls_array, crawled_urls):
         crawled_urls.write("https://" + http_split[1])
         crawled_urls.write("\n")
     except IndexError as e:
-        logging.info(str(e))
+        logger.info(str(e))

 def add_to_planned_urls_list(new_link, planned_urls_array, planned_urls):
     # Adds new link to array
@@ -322,10 +337,10 @@ def create_name_from_html(html):
     if name:
         # removes invalid characters from title
         name = format_filename(name) + '__' + str(time.time())
-        logging.info('Created name ' + name)
+        logger.info('Created name ' + name)
     else:
         name = "no_title_" + str(time.time())  # if no title is provided, fall back to a timestamped placeholder
-        logging.warn('Failed to create a name, using \'' + name + '\' instead')
+        logger.warning('Failed to create a name, using \'' + name + '\' instead')
     return name

 def format_filename(name):
@@ -341,7 +356,7 @@ def format_filename(name):
         filename = filename.replace(' ', '_')
     except TypeError as e:
         filename = str(uuid.uuid4())
-        logging.error("Got and error: {}".format(str(e)))
+        logger.error("Got an error: {}".format(str(e)))
     return filename

 def is_relevant_link_from_html(link):
@@ -395,7 +410,7 @@ def request_url(url, visited_urls):
     )

     # Log that this URL is being saved
-    logging.info('Requesting ' + url)
+    logger.info('Requesting ' + url)
     visited_urls.write(url)
     visited_urls.write("\n")
     # Use requests module to get html from url as an object
@@ -409,19 +424,25 @@ def request_url(url, visited_urls):
     except requests.exceptions.Timeout:
         # Maybe set up for a retry, or continue in a retry loop
         print("\nTook too long to get the page.")
-        logging.info("Took too long to get the page.")
+        logger.info("Took too long to get the page.")
     except requests.exceptions.RequestException as e:
         # catastrophic error. bail.
         print("\nCannot get the page.")
-        logging.info("Cannot get the page.")
+        logger.info("Cannot get the page.")
     except KeyboardInterrupt:
         print("\n\nScript interrupted by user. Shutting down.")
-        logging.info("Script interrupted by user")
+        logger.info("Script interrupted by user")
         shut_down()
     except Exception:
-        logging.exception("Couldn't request " + url)
+        logger.exception("Couldn't request " + url)
     return None
+def exception(request, exception):
+    # exception_handler callback for grequests.map, invoked with the failed
+    # request and the exception it raised
+    print("Problem: {}: {}".format(request.url, exception))
+
+def request_urls(urls_list):
+    # map() sends the GETs concurrently (at most 5 in flight); requests that
+    # fail are reported through the handler above and come back as None
+    results = grequests.map((grequests.get(u) for u in urls_list), exception_handler=exception, size=5)
+    return results
+
 def get_start_page():
     """Open the visited_urls text file and count the number of lines
     in it – that's how many pages the script visited
@@ -437,6 +458,7 @@ def get_start_page():
 class ChDir(object):
     """
     Step into a directory context to operate in.
+    https://pythonadventures.wordpress.com/2013/12/15/chdir-a-context-manager-for-switching-working-directories/
     """
     def __init__(self, path):
         self.old_dir = os.getcwd()