55import pymysql
66import re
77import matplotlib .pyplot as plt
8- import time
98
109from pyecharts import Bar
1110from pyquery import PyQuery as pq
11+ from threading import Thread
12+ from time import time
1213
1314# 通过指定的字符集对页面进行解码(不是每个网站都将字符集设置为utf-8)
1415def decode_page (page_bytes , charset = 'utf-8' ):
@@ -72,6 +73,36 @@ def collect_data(page_html, start_index):
7273 else :
7374 break
7475
def _replace_quote_runs(text):
    # Turn each run of single quotes, then each run of double quotes, into one space.
    # The two passes are kept separate so adjacent mixed quotes (e.g. '") still
    # yield two spaces, exactly as the two chained substitutions always did.
    without_single = re.sub(r'\'+', ' ', text)
    return re.sub(r'\"+', ' ', without_single)


# multi-thread analyze data with pyquery
def multi_thread_collect_data(page_html, start_index):
    """Extract book records from a page and store each one on its own thread.

    Walks the page's ``<a>`` elements starting at position ``start_index``.
    For every anchor that contains a non-empty ``.hanghang-list-name`` it reads
    the name, the ``.hanghang-list-zuozhe`` author, the ``.hanghang-list-num``
    download count and the anchor's ``href``, then starts a ``Thread`` running
    ``store_data(name, author, download_count, href)``. The walk stops at the
    first anchor without a book name. The function returns only after every
    started thread has been joined.

    Args:
        page_html: HTML text of the page, passed straight to ``pq``.
        start_index: Index of the first ``<a>`` element to inspect.

    Returns:
        None.
    """
    # Select the anchors once: the document does not change while we iterate,
    # so re-running the 'a' selector on every field lookup was redundant work.
    anchors = pq(page_html)('a')
    db_threads = []
    query_index = start_index
    while True:
        anchor = anchors.eq(query_index)
        book_name = anchor.find('.hanghang-list-name').text()
        if not book_name:
            # First anchor without a book name marks the end of the listing.
            break

        href_list = anchor.attr('href')
        book_download_num = anchor.find('.hanghang-list-num').text()
        book_author = anchor.find('.hanghang-list-zuozhe').text()
        # Quotes are removed from the free-text fields before storage
        # (presumably because store_data embeds them in SQL text -- verify).
        final_book_name = _replace_quote_runs(book_name)
        final_book_author = _replace_quote_runs(book_author)
        query_index += 1

        # NOTE(review): one thread is started per record with no upper bound,
        # and store_data opens its own MySQL connection on each call; consider
        # a bounded worker pool if pages can hold many records.
        worker = Thread(
            target=store_data,
            args=(final_book_name, final_book_author, book_download_num, href_list),
        )
        db_threads.append(worker)
        worker.start()

    # wait for all DB operations to finish
    for worker in db_threads:
        worker.join()
104+
105+
75106# store data into pymysql
76107def store_data (g_name , g_author , g_downloadcount , g_link ):
77108 db = pymysql .connect (host = "localhost" , user = "root" , password = "123456" , database = "testdb" , charset = "utf8" )
@@ -169,6 +200,15 @@ def get_final_url(select_results):
169200 file_handle .write ('\n ' )
170201 file_handle .close ()
171202
def multi_thread_get_html(url_unit, header, queue_num):
    """Download one page and pass its HTML to the threaded collector.

    Intended as a thread target: it looks up the page's encoding, fetches the
    HTML using that encoding, and forwards the result to
    ``multi_thread_collect_data``.

    Args:
        url_unit: URL of the page to process.
        header: Request headers forwarded to both fetch helpers.
        queue_num: Index of the first ``<a>`` element the collector inspects.

    Returns:
        None.
    """
    # The encoding must be resolved first; the HTML fetch depends on it.
    charset = get_page_encode(url_unit, header)
    multi_thread_collect_data(
        # The literal 3 is forwarded unchanged (looks like a retry/timeout
        # setting of get_page_html -- TODO confirm against its definition).
        get_page_html(url_unit, header, 3, charset),
        queue_num,
    )
210+
211+
172212def sub_sort (array ,low ,high ):
173213 key = array [low ]
174214 while low < high :
@@ -191,6 +231,7 @@ def quick_sort(array,low,high):
191231if __name__ == "__main__" :
192232 #url = 'https://www.671cf.com/htm/index.htm'
193233 #url = 'https://www.gavbus.com/'
234+ start = time ()
194235 url = 'http://www.ireadweek.com/'
195236 header = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36' }
196237 ssl ._create_default_https_context = ssl ._create_unverified_context
@@ -199,7 +240,8 @@ def quick_sort(array,low,high):
199240 search_list = []
200241 search_list = get_whole_page_url (url , header )
201242 cycle_flag = 0
202- for url_unit in search_list :
243+ GH_threads = []
244+ '''for url_unit in search_list:
203245 print ("---------url-------", url_unit)
204246 if cycle_flag:
205247 queue_num = 7
@@ -212,11 +254,26 @@ def quick_sort(array,low,high):
212254 page_html = get_page_html(url_unit, header, 3, page_encode)
213255 # get html data
214256 collect_data(page_html, queue_num)
257+ '''
258+ for url_unit in search_list :
259+ print ("---------url-------" , url_unit )
260+ if cycle_flag :
261+ queue_num = 7
262+ else :
263+ cycle_flag = 1
264+ queue_num = 9
265+ t = Thread (target = multi_thread_get_html , args = (url_unit , header , queue_num ))
266+ GH_threads .append (t )
267+ t .start ()
268+ for t in GH_threads :
269+ t .join ()
215270
216271 results = select_data_from_mysql ()
217272 #draw_data_matplot(results)
218273 get_final_url (results )
219274 draw_data_echart (results )
275+ end = time ()
276+ print ('cost time is %.3f s ' % (end - start ))
220277
221278 '''
222279 # test mysql update
0 commit comments