 from time import time
 from goto import with_goto
 
+# global value setting
+url_list = []
+
 # Decode the page with the specified charset (not every site sets its charset to utf-8)
 def decode_page(page_bytes, charset='utf-8'):
     page_html = None
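decode_page is truncated by the hunk; for context, a minimal sketch of the usual shape of such a helper, assuming it simply returns None when the bytes cannot be decoded (the body below is an assumption, not the file's actual code):

    # Sketch (assumption): decode page bytes with the given charset;
    # return None if the bytes cannot be decoded.
    def decode_page_sketch(page_bytes, charset='utf-8'):
        try:
            return page_bytes.decode(charset)
        except UnicodeDecodeError:
            return None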
@@ -27,12 +30,17 @@ def get_page_html(seed_url, header_url, retry_times=3, charset='utf-8'):
     print("----page html-------")
     page_html = None
     try:
-        page_bytes = urllib.request.urlopen(urllib.request.Request(seed_url, headers=header_url)).read()
+        page_bytes = urllib.request.urlopen(urllib.request.Request(seed_url, headers=header_url), timeout=1).read()
         page_html = decode_page(page_bytes, charset)
-    except URLError:
-        # logging.error('URL:', error)
-        if retry_times > 0:
-            return get_page_html(seed_url, header_url, retry_times=retry_times - 1, charset=charset)
+    except Exception as e:
+        if str(e) == "timed out":
+            if retry_times > 0:
+                print("urlopen error timed out retry!")
+                return get_page_html(seed_url, header_url, retry_times=retry_times - 1, charset=charset)
+            else:
+                return -1
+        else:
+            raise Exception('get html exception error!')
     return page_html
 
 # Get the page's encoding format
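Comparing str(e) == "timed out" is brittle: a timeout can surface as socket.timeout directly or wrapped in a urllib.error.URLError. A sketch of the same retry-then-return -1 behavior keyed on exception types instead of message text (get_page_html_sketch is a hypothetical name; the retry count and -1 sentinel mirror the diff):

    import socket
    import urllib.request
    from urllib.error import URLError

    def get_page_html_sketch(seed_url, header_url, retry_times=3, charset='utf-8'):
        try:
            req = urllib.request.Request(seed_url, headers=header_url)
            page_bytes = urllib.request.urlopen(req, timeout=1).read()
            return decode_page(page_bytes, charset)
        except socket.timeout:
            if retry_times > 0:
                return get_page_html_sketch(seed_url, header_url, retry_times - 1, charset)
            return -1  # sentinel: callers treat -1 as "timed out, gave up"
        except URLError as e:
            # urlopen can wrap the timeout: URLError(reason=socket.timeout)
            if isinstance(e.reason, socket.timeout) and retry_times > 0:
                return get_page_html_sketch(seed_url, header_url, retry_times - 1, charset)
            raise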
@@ -187,8 +195,8 @@ def draw_data_echart(select_results):
 def get_final_url(select_results):
     for final_list in select_results:
         final_url = 'http://www.ireadweek.com' + final_list[3]
-        final_page_encode = get_page_encode(final_url, header)
-        final_page_html = get_page_html(final_url, header, 3, final_page_encode)
+        # final_page_encode = get_page_encode(final_url, header)
+        final_page_html = get_page_html(final_url, header, 3)
         final_html_query = pq(final_page_html)
         final_link_list = final_html_query('.hanghang-shu-content-btn')
         download_link = final_link_list.find('a').attr('href')
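For reference, the pyquery chain above can be exercised in isolation; a small sketch against hypothetical markup (the CSS class comes from the diff, the HTML is invented for illustration):

    from pyquery import PyQuery as pq

    # Hypothetical markup mimicking the site's download-button block
    html = '<div class="hanghang-shu-content-btn"><a href="http://example.com/book">download</a></div>'
    doc = pq(html)
    download_link = doc('.hanghang-shu-content-btn').find('a').attr('href')
    print(download_link)  # -> http://example.com/book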
@@ -200,11 +208,17 @@ def get_final_url(select_results):
 
 def multi_thread_get_html(url_unit, header, queue_num):
     # get page encode
-    page_encode = get_page_encode(url_unit, header)
+    # page_encode = get_page_encode(url_unit, header)
     # get page html
-    page_html = get_page_html(url_unit, header, 3, page_encode)
-    # get html data
-    multi_thread_collect_data(page_html, queue_num)
+    page_html = get_page_html(url_unit, header, 3)
+    if page_html == -1:
+        global url_list
+        print("get html timed out! append the url list!")
+        url_list.append(url_unit)
+    else:
+        return 0
+    # get html data
+    #multi_thread_collect_data(page_html, queue_num)
 
 def sub_sort(array, low, high):
     key = array[low]
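url_list is appended to from multiple worker threads. A bare list.append is effectively atomic under CPython's GIL, but if the failure bookkeeping ever grows past a single append, an explicit lock keeps it safe; a sketch under that assumption (record_failed_url is a hypothetical helper):

    import threading

    url_list = []
    url_list_lock = threading.Lock()

    def record_failed_url(url_unit):
        # called from worker threads when get_page_html returns -1
        with url_list_lock:
            url_list.append(url_unit)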
@@ -247,20 +261,21 @@ def quick_sort(array,low,high):
     t.start()
 for t in GH_threads:
     t.join()
-
+print("url check list is : ", url_list)
+'''
 results = select_data_from_mysql()
 #draw_data_matplot(results)
 get_final_url(results)
 draw_data_echart(results)
 end = time()
 print('cost time is %.3f s ' % (end - start))
-
+'''
 '''# -----test-----
 #test_url = 'http://www.ireadweek.com/index.php/index/16.html'
 #test_url = 'http://pan.baidu.com/s/1qY91y0G'
 test_url = 'http://www.ireadweek.com/index.php/index/1.html'
-page_encode = get_page_encode(test_url, header)
-page_html = get_page_html(test_url, header, 3, page_encode)
+# page_encode = get_page_encode(test_url, header)
+page_html = get_page_html(test_url, header, 3)
 #collect_data(page_html, 9)
 html_query = pq(page_html)
 link_list = html_query('.action-pagination')
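The commit ends by printing url_list, the URLs that exhausted their retries. A natural follow-up, sketched here but not part of the commit, is a second pass over that list before giving up:

    # Sketch (not in the commit): re-fetch the URLs that timed out
    still_failing = []
    for url_unit in url_list:
        if get_page_html(url_unit, header, 3) == -1:
            still_failing.append(url_unit)
    print("urls still failing after second pass: ", still_failing)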