@@ -28,14 +28,13 @@ def decode_page(page_bytes, charset='utf-8'):
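2828# Fetch the page's HTML code (retries a specified number of times on timeout; the retry jumps back to a goto label rather than recursing)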
2929@with_goto
3030def get_page_html (seed_url , header_url , retry_times = 3 , charset = 'utf-8' ):
31- print ("----page html-------" )
31+ # print ("----page html-------")
3232 page_html = None
3333 label .get_page_retry
3434 try :
3535 page_bytes = urllib .request .urlopen (urllib .request .Request (seed_url , headers = header_url ), timeout = 10 ).read ()
3636 page_html = decode_page (page_bytes , charset )
3737 except Exception as e :
38- print ("-------exception-----: " ,str (e ))
3938 if ((str (e ) == "timed out" ) or (str (e ) == "<urlopen error timed out>" )):
4039 if (retry_times > 0 ):
4140 print ("urlopen error timed out retry!" )
@@ -220,10 +219,10 @@ def multi_thread_get_html(url_unit, header, queue_num):
220219 print ("get html timed out! append the url list!" )
221220 url_list .append (url_unit )
222221 else :
223- print ("-----------finish get html ----------" )
224- return 0
222+ # print("-----------finish get html ----------")
223+ # return 0
225224 # get html data
226- # multi_thread_collect_data(page_html, queue_num)
225+ multi_thread_collect_data (page_html , queue_num )
227226
228227def sub_sort (array ,low ,high ):
229228 key = array [low ]
@@ -266,15 +265,29 @@ def quick_sort(array,low,high):
266265 t .start ()
267266 for t in GH_threads :
268267 t .join ()
268+
269+ # check if url list is empty
269270 print ("url check list is : " , url_list )
270- '''
271- results = select_data_from_mysql()
272- #draw_data_matplot(results)
273- get_final_url(results)
274- draw_data_echart(results)
271+ retry_list = url_list
272+ RE_threads = []
273+ if retry_list :
274+ for retry_n in retry_list :
275+ url_list .remove (retry_n )
276+ t = Thread (target = multi_thread_get_html , args = (retry_n , header , 9 ))
277+ RE_threads .append (t )
278+ t .start ()
279+ for t in RE_threads :
280+ t .join ()
281+
282+ else :
283+ results = select_data_from_mysql ()
284+ #draw_data_matplot(results)
285+ get_final_url (results )
286+ draw_data_echart (results )
287+ print ("url retry check list is : " , url_list )
275288 end = time ()
276289 print ('cost time is %.3f s ' % (end - start ))
277- '''
290+
278291 '''# -----test-----
279292 #test_url = 'http://www.ireadweek.com/index.php/index/16.html'
280293 #test_url = 'http://pan.baidu.com/s/1qY91y0G'
@@ -290,7 +303,3 @@ def quick_sort(array,low,high):
290303 print (number.split('/')[3].split('.')[0])
291304 print (type(int(number.split('/')[3].split('.')[0])))
292305 '''
293-
294-
295-
296-
0 commit comments