@@ -26,17 +26,21 @@ def decode_page(page_bytes, charset='utf-8'):
     return page_html
 
 # Get the page HTML (retry the request a set number of times via recursion)
+@with_goto
 def get_page_html(seed_url, header_url, retry_times=3, charset='utf-8'):
     print("----page html-------")
     page_html = None
+    label .get_page_retry
     try:
-        page_bytes = urllib.request.urlopen(urllib.request.Request(seed_url, headers=header_url), timeout=1).read()
+        page_bytes = urllib.request.urlopen(urllib.request.Request(seed_url, headers=header_url), timeout=10).read()
         page_html = decode_page(page_bytes, charset)
     except Exception as e:
-        if str(e) == "timed out":
+        print("-------exception-----: ", str(e))
+        if ((str(e) == "timed out") or (str(e) == "<urlopen error timed out>")):
             if (retry_times > 0):
                 print("urlopen error timed out retry!")
-                return get_page_html(seed_url, header_url, retry_times=retry_times - 1, charset=charset)
+                retry_times = retry_times - 1
+                goto .get_page_retry
             else:
                 return -1
         else:
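
The hunk above replaces the recursive retry in get_page_html with a label/goto jump. Below is a minimal standalone sketch of that pattern, assuming the third-party goto-statement package, whose @with_goto decorator resolves the label . / goto . references at bytecode level; fetch_with_retry and its parameters are illustrative names, not part of this commit.

import urllib.request
from goto import with_goto

@with_goto
def fetch_with_retry(url, headers, retry_times=3, timeout=10):
    label .retry                            # jump target: attempt the request
    try:
        request = urllib.request.Request(url, headers=headers)
        return urllib.request.urlopen(request, timeout=timeout).read()
    except Exception as e:
        if "timed out" in str(e):
            if retry_times > 0:
                retry_times = retry_times - 1   # consume one retry
                goto .retry                     # jump back above the try block
            return -1                           # retries exhausted
        raise                                   # not a timeout: propagate
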
@@ -136,9 +140,9 @@ def get_url_request_handle(header_url, header):
     first_url = header_url + 'index.php/index/1.html'
     label .retry
     try:
-        first_html = urllib.request.urlopen(urllib.request.Request(first_url, headers=header), timeout=0.5).read()
+        first_html = urllib.request.urlopen(urllib.request.Request(first_url, headers=header), timeout=1).read()
     except Exception as e:
-        if str(e) == "timed out":
+        if ((str(e) == "timed out") or (str(e) == "<urlopen error timed out>")):
             if (retry_count > 0):
                 print("urlopen error timed out retry!")
                 retry_count = retry_count - 1
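
Both hunks detect timeouts by comparing str(e) against the two strings urllib produces. A type-based check is a more robust alternative (a sketch, not part of this commit; is_timeout is an illustrative helper): read timeouts surface as socket.timeout, while connect timeouts arrive as urllib.error.URLError wrapping a socket.timeout reason, which is what prints as "<urlopen error timed out>".

import socket
import urllib.error

def is_timeout(exc):
    # direct read timeout raised by the socket layer
    if isinstance(exc, socket.timeout):
        return True
    # connect timeout wrapped by urllib as URLError(socket.timeout(...))
    return isinstance(exc, urllib.error.URLError) and isinstance(exc.reason, socket.timeout)
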
@@ -216,6 +220,7 @@ def multi_thread_get_html(url_unit, header, queue_num):
         print("get html timed out! append the url list!")
         url_list.append(url_unit)
     else:
+        print("-----------finish get html ----------")
         return 0
     # get html data
     #multi_thread_collect_data(page_html, queue_num)
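
The hunk above appends timed-out URLs to the shared url_list and logs completion before returning. A sketch of the same bookkeeping with a thread-safe queue swapped in for the plain list (retry_queue and record_result are illustrative names, not part of this commit):

import queue

retry_queue = queue.Queue()   # thread-safe stand-in for the module-level url_list

def record_result(url_unit, page_html):
    # mirror the branch above: keep timed-out URLs for a later retry pass
    if page_html == -1:
        print("get html timed out! append the url list!")
        retry_queue.put(url_unit)
    else:
        print("-----------finish get html ----------")
    return 0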