Commit 343e892

Merge pull request #54 from Ericyuanhui/test
Add timeout and exception handling
2 parents 76c30c5 + 1df7fc0 commit 343e892

1 file changed

Language/python/crawl_web/crawl_web.py

Lines changed: 30 additions & 15 deletions
@@ -12,6 +12,9 @@
 from time import time
 from goto import with_goto
 
+# global value setting
+url_list = []
+
 # Decode the page with the given charset (not every site sets its charset to utf-8)
 def decode_page(page_bytes, charset='utf-8'):
     page_html = None
@@ -27,12 +30,17 @@ def get_page_html(seed_url, header_url, retry_times=3, charset='utf-8'):
     print ("----page html-------")
     page_html = None
     try:
-        page_bytes = urllib.request.urlopen(urllib.request.Request(seed_url, headers = header_url)).read()
+        page_bytes = urllib.request.urlopen(urllib.request.Request(seed_url, headers = header_url), timeout=1).read()
         page_html = decode_page(page_bytes, charset)
-    except URLError:
-        # logging.error('URL:', error)
-        if retry_times > 0:
-            return get_page_html(seed_url, header_url, retry_times=retry_times - 1, charset=charset)
+    except Exception as e:
+        if str(e) == "timed out":
+            if (retry_times > 0):
+                print ("urlopen error timed out retry!")
+                return get_page_html(seed_url, header_url, retry_times=retry_times - 1, charset=charset)
+            else:
+                return -1
+        else:
+            raise Exception('get html exception error!')
     return page_html
 
 # Get the page's encoding format
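
The new handler detects timeouts by comparing the exception message to the string "timed out", which matches CPython's socket.timeout message but silently misses connect-phase timeouts that urllib wraps in URLError. A minimal sketch of the same retry-on-timeout behaviour keyed on exception types instead (fetch_with_retry is a hypothetical helper, not part of this commit):

import socket
import urllib.request
from urllib.error import URLError

def fetch_with_retry(seed_url, headers, retry_times=3, timeout=1):
    # Hypothetical alternative to get_page_html's string matching:
    # retry on timeout exceptions, re-raise anything else.
    for attempt in range(retry_times + 1):
        try:
            req = urllib.request.Request(seed_url, headers=headers)
            return urllib.request.urlopen(req, timeout=timeout).read()
        except socket.timeout:
            # read-phase timeout is raised directly as socket.timeout
            print("urlopen timed out, retry %d" % (attempt + 1))
        except URLError as e:
            # connect-phase timeouts arrive wrapped in URLError
            if isinstance(e.reason, socket.timeout):
                print("urlopen timed out, retry %d" % (attempt + 1))
            else:
                raise
    return -1  # mirror get_page_html's sentinel for "gave up"

An iterative loop also avoids growing the call stack the way the recursive retry in get_page_html does.
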
@@ -187,8 +195,8 @@ def draw_data_echart(select_results):
 def get_final_url(select_results):
     for final_list in select_results:
         final_url = 'http://www.ireadweek.com' + final_list[3]
-        final_page_encode = get_page_encode(final_url, header)
-        final_page_html = get_page_html(final_url, header, 3, final_page_encode)
+        #final_page_encode = get_page_encode(final_url, header)
+        final_page_html = get_page_html(final_url, header, 3)
         final_html_query = pq(final_page_html)
         final_link_list = final_html_query('.hanghang-shu-content-btn')
         download_link = final_link_list.find('a').attr('href')
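
With get_page_encode commented out, every call now falls back to get_page_html's charset='utf-8' default. If per-page charset detection is still wanted without a separate request, the Content-Type header of the response already being fetched usually carries it; a sketch under that assumption (detect_charset is a hypothetical helper taking the http.client.HTTPResponse returned by urlopen):

def detect_charset(response, default='utf-8'):
    # response.headers is an email.message.Message; get_content_charset()
    # parses the charset= parameter of the Content-Type header, if present.
    return response.headers.get_content_charset() or default
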
@@ -200,11 +208,17 @@
 
 def multi_thread_get_html(url_unit, header, queue_num):
     # get page encode
-    page_encode = get_page_encode(url_unit, header)
+    #page_encode = get_page_encode(url_unit, header)
     # get page html
-    page_html = get_page_html(url_unit, header, 3, page_encode)
-    # get html data
-    multi_thread_collect_data(page_html, queue_num)
+    page_html = get_page_html(url_unit, header, 3)
+    if (page_html == -1):
+        global url_list
+        print ("get html timed out! append the url list!")
+        url_list.append(url_unit)
+    else:
+        return 0
+    # get html data
+    #multi_thread_collect_data(page_html, queue_num)
 
 def sub_sort(array,low,high):
     key = array[low]
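
multi_thread_get_html now appends to the module-level url_list from worker threads. CPython's GIL makes list.append atomic in practice, but a queue.Queue states the intent explicitly and stays safe on other interpreters; a sketch of that alternative (failed_urls, record_failure, and drain_failures are hypothetical names, not in the commit):

import queue

failed_urls = queue.Queue()  # thread-safe stand-in for the global url_list

def record_failure(url):
    # put() may be called concurrently from any crawler thread
    failed_urls.put(url)

def drain_failures():
    # collect everything recorded so far, after the workers have joined
    urls = []
    while not failed_urls.empty():
        urls.append(failed_urls.get())
    return urls
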
@@ -247,20 +261,21 @@ def quick_sort(array,low,high):
         t.start()
     for t in GH_threads:
         t.join()
-
+    print ("url check list is : ", url_list)
+    '''
     results = select_data_from_mysql()
     #draw_data_matplot(results)
     get_final_url(results)
     draw_data_echart(results)
     end = time()
     print('cost time is %.3f s ' %(end - start))
-
+    '''
 '''# -----test-----
 #test_url = 'http://www.ireadweek.com/index.php/index/16.html'
 #test_url = 'http://pan.baidu.com/s/1qY91y0G'
 test_url = 'http://www.ireadweek.com/index.php/index/1.html'
-page_encode = get_page_encode(test_url, header)
-page_html = get_page_html(test_url, header, 3, page_encode)
+#page_encode = get_page_encode(test_url, header)
+page_html = get_page_html(test_url, header, 3)
 #collect_data(page_html, 9)
 html_query = pq(page_html)
 link_list = html_query('.action-pagination')
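
After the join, the commit only prints url_list; the timed-out URLs it collects are not yet retried. One way the list could be consumed (a sketch, not part of this commit) is a single sequential retry pass using the same get_page_html and header already defined in the file:

# Retry every URL that timed out during the threaded pass.
for retry_url in list(url_list):      # iterate over a copy while mutating
    page_html = get_page_html(retry_url, header, 3)
    if page_html != -1:
        url_list.remove(retry_url)    # drop the URL once it succeeds
print("still failing after retry:", url_list)
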
