Skip to content

Commit

Permalink
Merge pull request floodsung#65 from shreyaspadhy/master
Browse files (browse the repository at this point in the history)
Fixed download.py (Empty file downloads, encoding errors)
  • Loading branch information
Flood Sung authored Jul 11, 2017
2 parents 063b7f4 + a44ce12 commit 47a7e9a
Showing 1 changed file with 17 additions and 6 deletions.
23 changes: 17 additions & 6 deletions download.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,19 +3,27 @@
import re
from six.moves.urllib.request import urlopen
from six.moves.urllib.error import HTTPError
import urllib2
import shutil
import argparse
import mistune
import bs4 as BeautifulSoup
import socket
import time
import requests

# encoding=utf8
import sys

reload(sys)
sys.setdefaultencoding('utf8')

def download_pdf(link, location, name):
try:
response = urlopen(link, timeout=500)
file = open(os.path.join(location, name), 'w')
file.write(response.read())
file.close()
response = requests.get(link)
with open(os.path.join(location, name), 'wb') as f:
f.write(response.content)
f.close()
except HTTPError:
print('>>> Error 404: cannot be downloaded!\n')
raise
Expand All @@ -28,9 +36,11 @@ def clean_pdf_link(link):
link = link.replace('abs', 'pdf')
if not(link.endswith('.pdf')):
link = '.'.join((link, 'pdf'))

print(link)
return link

def clean_text(text, replacements = {' ': '_', '/': '_', '.': '', '"': ''}):
def clean_text(text, replacements = {':': '_', ' ': '_', '/': '_', '.': '', '"': ''}):
for key, rep in replacements.items():
text = text.replace(key, rep)
return text
Expand Down Expand Up @@ -95,13 +105,14 @@ def shorten_title(title):
if link is not None:
link = clean_pdf_link(link.attrs['href'])
ext = get_extension(link)
print(ext)
if not ext in forbidden_extensions:
print(shorten_title(point.text) + ' (' + link + ')')
try:
name = clean_text(point.text.split('[' + ext + ']')[0])
fullname = '.'.join((name, ext))
if not os.path.exists('/'.join((current_directory, fullname)) ):
download_pdf(link, current_directory, '.'.join((name, ext)))
download_pdf(link, current_directory, '.'.join((name, ext)))
except KeyboardInterrupt:
try:
print("Press Ctrl-C in 1 second to quit")
Expand Down

0 comments on commit 47a7e9a

Please sign in to comment.