From 1ef2808c90ce7f904400f10716db3feb14fafe31 Mon Sep 17 00:00:00 2001 From: suicidedamsel Date: Tue, 3 Jan 2017 22:23:20 +0800 Subject: [PATCH] Skip papers already downloaded, and add timeout --- download.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/download.py b/download.py index 79674bc..1679a4c 100644 --- a/download.py +++ b/download.py @@ -8,13 +8,15 @@ def download_pdf(link, location, name): try: - response = urllib2.urlopen(link) + response = urllib2.urlopen(link, timeout=500) file = open(os.path.join(location, name), 'w') file.write(response.read()) file.close() except urllib2.HTTPError: print('>>> Error 404: cannot be downloaded!\n') raise + except socket.timeout: + print(" ".join(("can't download", link, "due to connection timeout!")) ) def clean_pdf_link(link): if 'arxiv' in link: @@ -81,7 +83,8 @@ def shorten_title(title): current_directory = h1_directory elif point.name == 'h2': current_directory = os.path.join(h1_directory, clean_text(point.text)) - os.makedirs(current_directory) + if not os.path.exists(current_directory): + os.makedirs(current_directory) print_title(point.text) if point.name == 'p': @@ -93,7 +96,9 @@ def shorten_title(title): print(shorten_title(point.text) + ' (' + link + ')') try: name = clean_text(point.text.split('[' + ext + ']')[0]) - download_pdf(link, current_directory, '.'.join((name, ext))) + fullname = '.'.join((name, ext)) + if not os.path.exists('/'.join((current_directory, fullname)) ): + download_pdf(link, current_directory, '.'.join((name, ext))) except: failures.append(point.text)