Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 23 additions & 48 deletions spider/HtmlDownloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,61 +12,36 @@


class Html_Downloader(object):
@classmethod
def download(self, url):
count = 0 # 重试次数
r = ''
@staticmethod
def download(url):
try:
r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT)
r.encoding = chardet.detect(r.content)['encoding']
if (not r.ok) or len(r.content) < 500:
raise ConnectionError
else:
return r.text

except Exception:
count = 0 # 重试次数
proxylist = sqlhelper.select(10)
if not proxylist:
return None

while count < config.RETRY_TIME:
if (not r.ok) or len(r.content) < 500:
proxylist = sqlhelper.select(10)
try:
proxy = random.choice(proxylist)
ip = proxy[0]
port = proxy[1]
proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
try:
r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
r.encoding = chardet.detect(r.content)['encoding']
count += 1
except Exception as e:
count += 1

else:
return r.text

return None

except Exception as e:
while count < config.RETRY_TIME:
if r == '' or (not r.ok) or len(r.content) < 500:
try:
proxylist = sqlhelper.select(10)
proxy = random.choice(proxylist)
ip = proxy[0]
port = proxy[1]
proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
try:
r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
r.encoding = chardet.detect(r.content)['encoding']
count += 1
except Exception as e:
count += 1

except Exception as e:
return None

else:
return r.text

return None








r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
r.encoding = chardet.detect(r.content)['encoding']
if (not r.ok) or len(r.content) < 500:
raise ConnectionError
else:
return r.text
except Exception:
count += 1

return None
2 changes: 1 addition & 1 deletion util/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

class Test_URL_Fail(Exception):
def __str__(self):
str = "访问%s失败,请检查网络连接" % config.TEST_URL
str = "访问%s失败,请检查网络连接" % config.TEST_IP
return str


Expand Down