11
22import urllib .request
33import chardet
4- import builtwith
54import ssl
65import pymysql
6+ import re
7+ import matplotlib .pyplot as plt
78import time
89
10+ from pyecharts import Bar
911from pyquery import PyQuery as pq
1012
# Decode the page bytes with the given charset (not every site serves utf-8)
@@ -22,7 +24,6 @@ def decode_page(page_bytes, charset='utf-8'):
# Fetch the page's HTML (a limited number of retries, implemented via recursion)
2325def get_page_html (seed_url , header_url , retry_times = 3 , charset = 'utf-8' ):
2426 page_html = None
25- page_bytes = None
2627 try :
2728 page_bytes = urllib .request .urlopen (urllib .request .Request (seed_url , headers = header_url )).read ()
2829 page_html = decode_page (page_bytes , charset )
@@ -34,7 +35,6 @@ def get_page_html(seed_url, header_url, retry_times=3, charset='utf-8'):
3435
# Detect the page's character encoding
3637def get_page_encode (seed_url , header_url ):
37- page_bytes = None
3838 page_encode = None
3939 try :
4040 page_bytes = urllib .request .urlopen (urllib .request .Request (seed_url , headers = header_url )).read ()
@@ -54,18 +54,21 @@ def file_store(file_path, page_bytes):
5454
5555# analyze data with pyquery
def collect_data(page_html, start_index):
    """Walk the page's <a> anchors starting at start_index and store each book.

    page_html: HTML text of a listing page.
    start_index: index of the first anchor to inspect.

    Iteration stops at the first anchor that has no `.hanghang-list-name`
    text.  For each book found, the raw values are printed and the
    quote-sanitized name/author are handed to store_data().
    """
    html_query = pq(page_html)
    query_index = start_index
    while True:
        # Hoist the anchor lookup: the original recomputed
        # html_query('a').eq(query_index) for every field.
        anchor = html_query('a').eq(query_index)
        book_name = pq(anchor).find('.hanghang-list-name').text()
        if not book_name:
            break
        href_list = anchor.attr('href')
        book_download_num = pq(anchor).find('.hanghang-list-num').text()
        book_author = pq(anchor).find('.hanghang-list-zuozhe').text()
        # Crude sanitization: collapse runs of quotes to a space so the values
        # don't break the SQL that store_data builds.  NOTE(review): this is
        # not injection-safe — store_data should use parameterized queries.
        final_book_name = re.sub(r'\"+', ' ', re.sub(r'\'+', ' ', book_name))
        final_book_author = re.sub(r'\"+', ' ', re.sub(r'\'+', ' ', book_author))
        print("book_name: %s ,book num: %s ,book_author: %s, book link: %s" % (book_name, book_download_num, book_author, href_list))
        store_data(final_book_name, final_book_author, book_download_num, href_list)
        query_index = query_index + 1
7174
@@ -110,6 +113,69 @@ def get_whole_page_url(header_url, header):
110113 break
111114 return list_url
112115
116+ # select mysql data to chart
def select_data_from_mysql():
    """Fetch all `ireadlist` rows whose downloadcount exceeds 6000.

    Returns the fetched rows (a tuple of tuples), or an empty tuple if the
    query failed.  The connection is always closed.
    """
    select_db = pymysql.connect(host="localhost", user="root", password="123456", database="testdb", charset="utf8")
    # Pre-assign the result: the original referenced select_results after the
    # try block, raising NameError whenever the query had failed.
    select_results = ()
    select_sql = "SELECT * FROM ireadlist WHERE downloadcount > 6000"
    try:
        select_cursor = select_db.cursor()
        select_cursor.execute(select_sql)
        select_results = select_cursor.fetchall()
    except pymysql.Error as err:
        # Was a bare `except:` that swallowed everything, including Ctrl-C;
        # narrow it to database errors and report what actually went wrong.
        print("select error msg", err)
    finally:
        # Close even when execute() raises (original leaked on that path
        # because close() ran only after a successful try).
        select_db.close()
    return select_results
132+
133+ # draw data matplot
def draw_data_matplot(select_results):
    """Plot the download counts of the selected rows as a matplotlib figure.

    select_results: iterable of DB rows; row[2] holds the download count
    (convertible with int()).  Blocks in plt.show() until the window closes.
    """
    # row[2] is the download count; convert once while collecting.
    # (The original also built a list of names that it never used.)
    list_count = [int(select_list[2]) for select_list in select_results]
    # Use the builtin sort instead of the hand-rolled quick_sort, whose
    # partition step could overwrite (lose) elements.
    list_count.sort()
    for i in list_count:
        print("quick sort: ", i)
    plt.plot(list_count, 'bs')
    plt.show()
147+
148+ # draw data echart
def draw_data_echart(select_results):
    """Render the books' download counts as a pyecharts bar chart.

    select_results: iterable of DB rows where row[0] is the book name and
    row[2] the download count.  Writes the chart to downloadcount.html.
    """
    # Collect the two columns we chart: names on the x-axis, counts as bars.
    list_name = [row[0] for row in select_results]
    list_count = [int(row[2]) for row in select_results]
    bar = Bar("read weekly", "download count")
    bar.use_theme('light')
    bar.add("book download count", list_name, list_count,
            is_more_utils=True, is_label_show=True, is_datazoom_show=True)
    bar.render("downloadcount.html")
160+
def sub_sort(array, low, high):
    """Partition array[low..high] around pivot array[low] (fill scheme).

    Moves smaller elements left of the pivot and larger/equal ones right,
    then drops the pivot into its final slot.  Returns that slot's index.
    """
    key = array[low]
    while low < high:
        # Scan from the right for an element smaller than the pivot.
        while low < high and array[high] >= key:
            high -= 1
        # Fill the hole at `low` with it.  The original instead looped on
        # `array[high] < key` while copying array[high] into successive low
        # slots, overwriting live elements (e.g. [3, 1, 2] became [2, 2, 3]).
        array[low] = array[high]
        # Scan from the left for an element larger than the pivot and fill
        # the hole at `high` with it.
        while low < high and array[low] <= key:
            low += 1
        array[high] = array[low]
    array[low] = key
    return low

def quick_sort(array, low, high):
    """In-place ascending quicksort of array[low..high] (inclusive bounds)."""
    if low < high:
        key_index = sub_sort(array, low, high)
        # The pivot is final at key_index: recurse on key_index - 1, not
        # key_index, so the placed pivot is not re-partitioned.
        quick_sort(array, low, key_index - 1)
        quick_sort(array, key_index + 1, high)
178+
113179
114180if __name__ == "__main__" :
115181 #url = 'https://www.671cf.com/htm/index.htm'
@@ -135,17 +201,27 @@ def get_whole_page_url(header_url, header):
135201 page_html = get_page_html (url_unit , header , 3 , page_encode )
136202 # get html data
137203 collect_data (page_html , queue_num )
138- '''
204+
205+ results = select_data_from_mysql ()
206+ #draw_data_matplot(results)
207+ draw_data_echart (results )
139208
140209 # test mysql update
141- test_url = 'http://www.ireadweek.com/index.php/index/16.html'
210+ #test_url = 'http://www.ireadweek.com/index.php/index/16.html'
211+ '''test_url = 'http://www.ireadweek.com/index.php/index/168.html'
142212 page_encode = get_page_encode(test_url, header)
143213
144214 page_html = get_page_html(test_url, header, 3, page_encode)
145215
146216 collect_data(page_html, 9)
147217 '''
148218
219+ '''plt.plot([1, 2, 3, 4])
220+ plt.ylabel('some numbers')
221+ plt.show()
222+ '''
223+
224+
149225
150226
151227
0 commit comments