1010from pyquery import PyQuery as pq
1111from threading import Thread
1212from time import time
13+ from goto import with_goto
1314
# Decode page bytes with the given charset (not every site serves UTF-8).
def decode_page(page_bytes, charset='utf-8'):
    """Decode *page_bytes* using *charset*; return None when decoding fails.

    charset may be None or empty (get_page_encode returns None when the
    page's charset cannot be detected) — fall back to 'utf-8' in that case
    instead of letting bytes.decode(None) raise TypeError.
    """
    try:
        return page_bytes.decode(charset or 'utf-8')
    except (UnicodeDecodeError, LookupError):
        # Best-effort: undecodable content or an unknown charset name
        # yields None rather than crashing the crawl.
        # logging.error('Decode:', error)
        return None
2424
2525# 获取页面的HTML代码(通过递归实现指定次数的重试操作)
2626def get_page_html (seed_url , header_url , retry_times = 3 , charset = 'utf-8' ):
27+ print ("----page html-------" )
2728 page_html = None
2829 try :
2930 page_bytes = urllib .request .urlopen (urllib .request .Request (seed_url , headers = header_url )).read ()
@@ -36,6 +37,7 @@ def get_page_html(seed_url, header_url, retry_times=3, charset='utf-8'):
3637
3738# 获得页面的编码格式
3839def get_page_encode (seed_url , header_url ):
40+ print ("----page encode-------" )
3941 page_encode = None
4042 try :
4143 page_bytes = urllib .request .urlopen (urllib .request .Request (seed_url , headers = header_url )).read ()
@@ -89,20 +91,17 @@ def multi_thread_collect_data(page_html, start_index):
8991 final_book_author = re .sub (r'\"+' , ' ' , re_book_author )
9092 if book_name :
9193 query_index = query_index + 1
92- #print("book_name: %s ,book num: %s ,book_author: %s, book link: %s" % (book_name, book_download_num, book_author, href_list))
93-
94+ print ("book_name: %s ,book num: %s ,book_author: %s, book link: %s" % (book_name , book_download_num , book_author , href_list ))
9495 # multi_thread database store all info
9596 t = Thread (target = store_data , args = (final_book_name , final_book_author , book_download_num , href_list ))
9697 DB_threads .append (t )
9798 t .start ()
98- #store_data(final_book_name, final_book_author, book_download_num, href_list)
9999 else :
100100 break
101101 # wait for all DB operation finish
102102 for t in DB_threads :
103103 t .join ()
104104
105-
106105# store data into pymysql
107106def store_data (g_name , g_author , g_downloadcount , g_link ):
108107 db = pymysql .connect (host = "localhost" , user = "root" , password = "123456" , database = "testdb" , charset = "utf8" )
@@ -123,26 +122,27 @@ def store_data(g_name, g_author, g_downloadcount, g_link):
123122 db .commit ()
124123 db .close ()
125124
126- def get_whole_page_url (header_url , header ):
127- list_url = []
128- page_number = 1
129- while (1 ):
130- test_url = header_url + 'index.php/index/' + str (page_number ) + '.html'
131- test_header = header
132- try :
133- test_html = urllib .request .urlopen (urllib .request .Request (test_url , headers = test_header )).read ()
134- test_query = pq (test_html )
135- test_name = pq (test_query ('a' ).eq (7 )).find ('.hanghang-list-name' ).text ()
136- if test_name :
137- page_number = page_number + 1
138- list_url .append (test_url )
139- #print ("list name ", test_name)
140- #time.sleep(2)
# Fetch the first index page and return the total page count parsed from
# its pagination links.
def get_url_request_handle(header_url, header):
    """Request <header_url>index.php/index/1.html and return the last page
    number found in the '.action-pagination' block.

    Retries up to 3 times on a timeout (0.5 s per attempt).  Raises
    Exception when every attempt times out, or on any other urlopen error.

    NOTE(review): a plain retry loop replaces the previous goto/label.retry
    construct, so the non-stdlib `goto` package is no longer needed here.
    """
    first_url = header_url + 'index.php/index/1.html'
    retry_count = 3
    while True:
        try:
            request = urllib.request.Request(first_url, headers=header)
            first_html = urllib.request.urlopen(request, timeout=0.5).read()
            break
        except Exception as e:
            # URLError wraps the socket timeout as '<urlopen error timed out>',
            # so a substring test is needed rather than exact equality.
            if 'timed out' in str(e):
                if retry_count > 0:
                    print("urlopen error timed out retry!")
                    retry_count = retry_count - 1
                    continue
                raise Exception('urlopen error timed out more than three times!')
            raise Exception('exception error!')
    html_query = pq(first_html)
    link_list = html_query('.action-pagination')
    # The 7th anchor (index 6) links to the last page, e.g.
    # '/index.php/index/25.html' — take the trailing number.
    page_number = int(link_list.find('a').eq(6).attr('href').split('/')[3].split('.')[0])
    return page_number
146146
147147# select mysql data to chart
148148def select_data_from_mysql ():
@@ -165,8 +165,6 @@ def draw_data_matplot(select_results):
165165 for select_list in select_results :
166166 list_name .append (select_list [0 ])
167167 list_count .append (int (select_list [2 ]))
168- #print ("select name: %s, select count: %d" % (select_list[0], int(select_list[2])))
169- #plt.plot(list_count, 'bs')
170168 quick_sort (list_count , 0 , len (list_count )- 1 )
171169 for i in list_count :
172170 print ("quick sort: " , i )
@@ -208,7 +206,6 @@ def multi_thread_get_html(url_unit, header, queue_num):
208206 # get html data
209207 multi_thread_collect_data (page_html , queue_num )
210208
211-
212209def sub_sort (array ,low ,high ):
213210 key = array [low ]
214211 while low < high :
@@ -227,36 +224,19 @@ def quick_sort(array,low,high):
227224 quick_sort (array ,low ,key_index )
228225 quick_sort (array ,key_index + 1 ,high )
229226
230-
231227if __name__ == "__main__" :
232- #url = 'https://www.671cf.com/htm/index.htm'
233- #url = 'https://www.gavbus.com/'
228+
234229 start = time ()
235230 url = 'http://www.ireadweek.com/'
236231 header = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36' }
237232 ssl ._create_default_https_context = ssl ._create_unverified_context
238233
239- # get all page url list
240- search_list = []
241- search_list = get_whole_page_url (url , header )
234+ # get all page url count
235+ page_count = get_url_request_handle (url , header )
242236 cycle_flag = 0
243237 GH_threads = []
244- '''for url_unit in search_list:
245- print ("---------url-------", url_unit)
246- if cycle_flag:
247- queue_num = 7
248- else:
249- cycle_flag = 1
250- queue_num = 9
251- # get page encode
252- page_encode = get_page_encode(url_unit, header)
253- # get page html
254- page_html = get_page_html(url_unit, header, 3, page_encode)
255- # get html data
256- collect_data(page_html, queue_num)
257- '''
258- for url_unit in search_list :
259- print ("---------url-------" , url_unit )
238+ for n in range (1 , int (page_count )):
239+ url_unit = "http://www.ireadweek.com/index.php/index/" + str (n ) + ".html"
260240 if cycle_flag :
261241 queue_num = 7
262242 else :
@@ -275,19 +255,20 @@ def quick_sort(array,low,high):
275255 end = time ()
276256 print ('cost time is %.3f s ' % (end - start ))
277257
278- '''
279- # test mysql update
258+ '''# -----test-----
280259 #test_url = 'http://www.ireadweek.com/index.php/index/16.html'
281260 #test_url = 'http://pan.baidu.com/s/1qY91y0G'
282- test_url = 'http://www.ireadweek.com/index.php/bookInfo/11043.html'
283-
261+ test_url = 'http://www.ireadweek.com/index.php/index/1.html'
284262 page_encode = get_page_encode(test_url, header)
285263 page_html = get_page_html(test_url, header, 3, page_encode)
286264 #collect_data(page_html, 9)
287265 html_query = pq(page_html)
288- link_list = html_query('.hanghang-shu-content-btn')
289- print (link_list.find('a').attr('href'))
290- print (type(link_list.find('a').attr('href')))
266+ link_list = html_query('.action-pagination')
267+ print (link_list)
268+ print (link_list.find('a').eq(6).attr('href'))
269+ number = link_list.find('a').eq(6).attr('href')
270+ print (number.split('/')[3].split('.')[0])
271+ print (type(int(number.split('/')[3].split('.')[0])))
291272 '''
292273
293274
0 commit comments