Skip to content

Commit 2ff1418

Browse files
author
xuyuanhui
committed
test multi thread
Signed-off-by: xuyuanhui <xuyuanhui@huayun.com>
1 parent 1df7fc0 commit 2ff1418

File tree

1 file changed

+10
-5
lines changed

1 file changed

+10
-5
lines changed

Language/python/crawl_web/crawl_web.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,21 @@ def decode_page(page_bytes, charset='utf-8'):
2626
return page_html
2727

2828
# 获取页面的HTML代码(通过递归实现指定次数的重试操作)
29+
@with_goto
2930
def get_page_html(seed_url, header_url, retry_times=3, charset='utf-8'):
3031
print ("----page html-------")
3132
page_html = None
33+
label .get_page_retry
3234
try:
33-
page_bytes = urllib.request.urlopen(urllib.request.Request(seed_url, headers = header_url), timeout=1).read()
35+
page_bytes = urllib.request.urlopen(urllib.request.Request(seed_url, headers = header_url), timeout=10).read()
3436
page_html = decode_page(page_bytes, charset)
3537
except Exception as e:
36-
if str(e) == "timed out":
38+
print ("-------exception-----: " ,str(e))
39+
if ((str(e) == "timed out") or (str(e) == "<urlopen error timed out>")):
3740
if (retry_times > 0):
3841
print ("urlopen error timed out retry!")
39-
return get_page_html(seed_url, header_url, retry_times=retry_times - 1, charset=charset)
42+
retry_times = retry_times - 1
43+
goto .get_page_retry
4044
else:
4145
return -1
4246
else:
@@ -136,9 +140,9 @@ def get_url_request_handle(header_url, header):
136140
first_url = header_url + 'index.php/index/1.html'
137141
label .retry
138142
try:
139-
first_html = urllib.request.urlopen(urllib.request.Request(first_url, headers=header), timeout=0.5).read()
143+
first_html = urllib.request.urlopen(urllib.request.Request(first_url, headers=header), timeout=1).read()
140144
except Exception as e:
141-
if str(e) == "timed out":
145+
if ((str(e) == "timed out") or (str(e) == "<urlopen error timed out>")):
142146
if (retry_count > 0):
143147
print ("urlopen error timed out retry!")
144148
retry_count = retry_count - 1
@@ -216,6 +220,7 @@ def multi_thread_get_html(url_unit, header, queue_num):
216220
print ("get html timed out! append the url list!")
217221
url_list.append(url_unit)
218222
else:
223+
print("-----------finish get html ----------")
219224
return 0
220225
# get html data
221226
#multi_thread_collect_data(page_html, queue_num)

0 commit comments

Comments
 (0)