Commit 02d2896
Rotate Scraper Logs to Keep Them From Becoming Bulky
The universal scraper log file can get very big, very fast. These changes enable log rotation and also move the logs into the relevant batch website folder.
1 parent 905d0c6 commit 02d2896

File tree

1 file changed: scraper.py

Lines changed: 46 additions & 24 deletions
@@ -5,6 +5,7 @@
 import re
 import urllib.parse
 import logging
+import logging.handlers
 import os
 import time
 import codecs
@@ -24,13 +25,7 @@
 
 # update_tld_names() https://stackoverflow.com/a/22228140
 logger = logging.getLogger(__name__)
-
-# current time, used in the names of the folder and the logging file
-curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
-# Create a new log file
-logging.basicConfig(filename=('_uniscraperlog_' + curtime + '.log'),
-                    level=logging.DEBUG
-                    )
+logger.setLevel(logging.DEBUG)
 
 # https://github.com/tqdm/tqdm/issues/481
 tqdm.monitor_interval = 0
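
The hunk above replaces the module-level logging.basicConfig() call, which fixed a single log file at import time, with a bare named logger that stays silent until a handler is attached per batch. A minimal standalone sketch of that pattern (the file name here is hypothetical, not from the commit):

import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# No handler is attached yet, so this DEBUG record is dropped
# (Python's last-resort handler only passes WARNING and above).
logger.debug("dropped: no handler attached yet")

# Once a handler is attached -- in the commit, a RotatingFileHandler --
# records start flowing to the chosen file.
handler = logging.FileHandler("batch.log")  # stand-in for the rotating handler
logger.addHandler(handler)
logger.debug("recorded: written to batch.log")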
@@ -67,14 +62,34 @@ def main():
     with ChDir(batch_website):
         start_page = get_start_page()
 
+    setup_rotating_log(batch_website, seed)
+
     with ChDir(batch_website):
         crawl(seed, pbar[idx], start_page, planned_urls_array, crawled_urls_array, website, max_pages)
     overall_prog.update(1)
 
+def setup_rotating_log(batch_website, seed):
+    with ChDir(batch_website):
+        # current time, used in the names of the folder and the logging file
+        curtime = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
+        logs_dir = "logs"
+        if not os.path.exists(logs_dir):
+            os.mkdir(logs_dir)
+
+        log_file_name = '_uniscraperlog_{}_{}.log'.format(seed, curtime)
+        path_to_log_file = os.path.join(logs_dir, log_file_name)
+        # add a rotating logfile handler
+        handler = logging.handlers.RotatingFileHandler(
+            path_to_log_file,
+            maxBytes=2097152,  # 2 MB
+            backupCount=100
+        )
+        logger.addHandler(handler)
+
 def crawl(seed, prog_upd, start_page, planned_urls_array, crawled_urls_array, website, max_pages):
     """Function that takes link, saves the contents to text file call href_split
     """
-    logging.info("Crawling through domain '" + seed + "'")
+    logger.info("Crawling through domain '" + seed + "'")
     tqdm.write("++++++++++Crawling through domain {}+++++++++++".format(seed))
     visited_urls, planned_urls, crawled_urls = setup_crawler_files()
 
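With maxBytes=2097152 and backupCount=100, each batch keeps at most the active file plus 100 rotated copies, roughly 202 MB of logs per batch; anything older is discarded. A small self-contained sketch (temporary paths and tiny limits, purely illustrative) showing the rollover behaviour setup_rotating_log relies on:

import logging
import logging.handlers
import os
import tempfile

logs_dir = tempfile.mkdtemp()
handler = logging.handlers.RotatingFileHandler(
    os.path.join(logs_dir, "demo.log"),
    maxBytes=1024,   # tiny limit so rotation is easy to trigger
    backupCount=3,
)
demo = logging.getLogger("rotation_demo")
demo.setLevel(logging.DEBUG)
demo.addHandler(handler)

for i in range(200):
    demo.debug("filler message %d %s", i, "x" * 64)

# Expect demo.log plus up to three rotated copies (demo.log.1 ... demo.log.3);
# records older than backupCount files are gone.
print(sorted(os.listdir(logs_dir)))
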
@@ -166,20 +181,20 @@ def process_current_link(page, prog_upd, link, seed, visited_urls, crawled_urls_
                 # Close the pipe to the file
                 fo.close()
                 # Log the creation of the file
-                logging.info('Created file ' + name)
+                logger.info('Created file ' + name)
 
             except KeyboardInterrupt:
                 tqdm.write("Script interrupted by user. Shutting down.")
-                logging.info("Script interrupted by user")
+                logger.info("Script interrupted by user")
                 shut_down()
             except Exception:
-                logging.exception("Can not encode file: " + current_url)
+                logger.exception("Can not encode file: " + current_url)
         else:
             tqdm.write("No visible text in {}".format(link))
-            logging.warning('No visible text in ' + link)
+            logger.warning('No visible text in ' + link)
     # Else: html does not exist or is empty. Log error
     else:
-        logging.warning('Request for ' + link + ' returned empty html')
+        logger.warning('Request for ' + link + ' returned empty html')
         empty_request_log.write(link)
         empty_request_log.write("\n")
 
@@ -194,7 +209,7 @@ def process_current_link(page, prog_upd, link, seed, visited_urls, crawled_urls_
     if page % 50 == 0:
         size_of_directory = get_tree_size(os.curdir) / 1000000
         tqdm.write("Size: {} MB".format(str(round(size_of_directory, 5))))
-        logging.info("Size: " + str(round(size_of_directory, 5)) + "MB")
+        logger.info("Size: " + str(round(size_of_directory, 5)) + "MB")
     # Time delay in seconds to prevent crashing the server
     time.sleep(.01)
     return page
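
get_tree_size is not part of this diff; judging from the call site it returns a byte count for the directory tree, which the standard os.scandir recipe from PEP 471 provides. A sketch under that assumption:

import os

def get_tree_size(path):
    """Return the total size of all files under path, in bytes."""
    total = 0
    for entry in os.scandir(path):
        if entry.is_dir(follow_symlinks=False):
            total += get_tree_size(entry.path)
        else:
            total += entry.stat(follow_symlinks=False).st_size
    return total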
@@ -261,7 +276,7 @@ def process_links_from_html(html, prog_upd, cur_link, seed, crawled_urls_array,
             crawled_urls.write("https://" + http_split[1])
             crawled_urls.write("\n")
         except IndexError as e:
-            logging.info(str(e))
+            logger.info(str(e))
 
     return
 
@@ -288,7 +303,7 @@ def add_to_crawled_urls_list(new_link, crawled_urls_array, crawled_urls):
         crawled_urls.write("https://" + http_split[1])
         crawled_urls.write("\n")
     except IndexError as e:
-        logging.info(str(e))
+        logger.info(str(e))
 
 def add_to_planned_urls_list(new_link, planned_urls_array, planned_urls):
     # Adds new link to array
@@ -322,10 +337,10 @@ def create_name_from_html (html):
     if name:
         # removes invalid characters from title
        name = format_filename(name) + '__' + str(time.time())
-        logging.info('Created name ' + name)
+        logger.info('Created name ' + name)
     else:
         name = "no_title_" + str(time.time())  # if no title provided give a no title with a timestamp
-        logging.warn('Failed to create a name, using \'' + name + '\' instead')
+        logger.warn('Failed to create a name, using \'' + name + '\' instead')
     return name
 
 def format_filename(name):
@@ -341,7 +356,7 @@ def format_filename(name):
         filename = filename.replace(' ','_')
     except TypeError as e:
         filename = str(uuid.uuid4())
-        logging.error("Got and error: {}".format(str(e)))
+        logger.error("Got and error: {}".format(str(e)))
     return filename
 
 def is_relevant_link_from_html(link):
@@ -395,7 +410,7 @@ def request_url(url, visited_urls):
     )
 
     # Log that this URL is being saved
-    logging.info('Requesting ' + url)
+    logger.info('Requesting ' + url)
     visited_urls.write(url)
     visited_urls.write("\n")
     # Use requests module to get html from url as an object
@@ -409,19 +424,25 @@
     except requests.exceptions.Timeout:
         # Maybe set up for a retry, or continue in a retry loop
         print("\nTook too long to get the page.")
-        logging.info("Took too long to get the page.")
+        logger.info("Took too long to get the page.")
     except requests.exceptions.RequestException as e:
         # catastrophic error. bail.
         print("\nCannot get the page.")
-        logging.info("Cannot get the page.")
+        logger.info("Cannot get the page.")
     except KeyboardInterrupt:
         print("\n\nScript interrupted by user. Shutting down.")
-        logging.info("Script interrupted by user")
+        logger.info("Script interrupted by user")
         shut_down()
     except Exception:
-        logging.exception("Couldn\'t request " + url)
+        logger.exception("Couldn\'t request " + url)
     return None
 
+def exception(request, exception):
+    print("Problem: {}: {}".format(request.url, exception))
+
+def request_urls(urls_list):
+    results = grequests.map((grequests.get(u) for u in urls_list), exception_handler=exception, size=5)
+
 def get_start_page():
     """Open the visited_urls text file and count the number of lines
     in it – that's how many pages the script visited
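
The two functions added at the end of the previous hunk introduce concurrent fetching via grequests: grequests.map takes an iterable of prepared requests, runs them on a gevent pool (size=5 caps concurrency), and hands failures to exception_handler as (request, exception) pairs instead of raising. A hypothetical standalone usage sketch (placeholder URLs, not from the commit):

import grequests  # monkey-patches sockets via gevent on import

def on_error(request, exception):
    print("Problem: {}: {}".format(request.url, exception))

urls = ["https://example.com", "https://example.org"]  # placeholder URLs
responses = grequests.map(
    (grequests.get(u) for u in urls),
    exception_handler=on_error,
    size=5,
)
# Failed requests appear as None in the result list.
for resp in responses:
    if resp is not None:
        print(resp.url, resp.status_code)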
@@ -437,6 +458,7 @@ def get_start_page():
 class ChDir(object):
     """
     Step into a directory context on which to operate on.
+    https://pythonadventures.wordpress.com/2013/12/15/chdir-a-context-manager-for-switching-working-directories/
     """
     def __init__(self, path):
         self.old_dir = os.getcwd()
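
Only the __init__ of ChDir is visible in this hunk; following the pattern in the linked post, the full context manager presumably switches into the directory on entry and restores the previous working directory on exit. A sketch under that assumption:

import os

class ChDir(object):
    """Step into a directory, then restore the old one on exit."""
    def __init__(self, path):
        self.old_dir = os.getcwd()
        self.new_dir = path

    def __enter__(self):
        os.chdir(self.new_dir)
        return self

    def __exit__(self, *args):
        os.chdir(self.old_dir)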
