Skip to content

Commit 12e0d72

Browse files
committed
Update: add new feature
Signed-off-by: ericyuanhui <285521263@qq.com>
1 parent 2b3b157 commit 12e0d72

File tree

1 file changed

+83
-7
lines changed

1 file changed

+83
-7
lines changed

Language/python/crawl_web/crawl_web.py

Lines changed: 83 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11

22
import urllib.request
33
import chardet
4-
import builtwith
54
import ssl
65
import pymysql
6+
import re
7+
import matplotlib.pyplot as plt
78
import time
89

10+
from pyecharts import Bar
911
from pyquery import PyQuery as pq
1012

1113
# 通过指定的字符集对页面进行解码(不是每个网站都将字符集设置为utf-8)
@@ -22,7 +24,6 @@ def decode_page(page_bytes, charset='utf-8'):
2224
# 获取页面的HTML代码(通过递归实现指定次数的重试操作)
2325
def get_page_html(seed_url, header_url, retry_times=3, charset='utf-8'):
2426
page_html = None
25-
page_bytes = None
2627
try:
2728
page_bytes = urllib.request.urlopen(urllib.request.Request(seed_url, headers = header_url)).read()
2829
page_html = decode_page(page_bytes, charset)
@@ -34,7 +35,6 @@ def get_page_html(seed_url, header_url, retry_times=3, charset='utf-8'):
3435

3536
# 获得页面的编码格式
3637
def get_page_encode(seed_url, header_url):
37-
page_bytes = None
3838
page_encode = None
3939
try:
4040
page_bytes = urllib.request.urlopen(urllib.request.Request(seed_url, headers=header_url)).read()
@@ -54,18 +54,21 @@ def file_store(file_path, page_bytes):
5454

5555
# analyze data with pyquery
5656
def collect_data(page_html, start_index):
57-
html_query = None
5857
html_query = pq(page_html)
5958
query_index = start_index
6059
while (1):
6160
href_list = html_query('a').eq(query_index).attr('href')
6261
book_name = pq(html_query('a').eq(query_index)).find('.hanghang-list-name').text()
62+
re_book_name = re.sub(r'\'+', ' ', book_name)
63+
final_book_name = re.sub(r'\"+', ' ', re_book_name)
6364
book_download_num = pq(html_query('a').eq(query_index)).find('.hanghang-list-num').text()
6465
book_author = pq(html_query('a').eq(query_index)).find('.hanghang-list-zuozhe').text()
66+
re_book_author = re.sub(r'\'+', ' ', book_author)
67+
final_book_author = re.sub(r'\"+', ' ', re_book_author)
6568
if book_name:
6669
query_index = query_index + 1
6770
print ("book_name: %s ,book num: %s ,book_author: %s, book link: %s" %(book_name, book_download_num, book_author, href_list))
68-
store_data(book_name, book_author, book_download_num, href_list)
71+
store_data(final_book_name, final_book_author, book_download_num, href_list)
6972
else:
7073
break
7174

@@ -110,6 +113,69 @@ def get_whole_page_url(header_url, header):
110113
break
111114
return list_url
112115

116+
# select mysql data to chart
def select_data_from_mysql():
    """Fetch all ireadlist rows whose downloadcount exceeds 6000.

    Returns:
        A tuple of result rows from ``cursor.fetchall()``; an empty tuple
        when the query fails.

    Connection parameters are hard-coded for the local test database
    (NOTE(review): credentials in source — move to config for real use).
    """
    select_db = pymysql.connect(host="localhost", user="root", password="123456", database="testdb", charset="utf8")
    # Safe default: the original left select_results unbound at `return`
    # whenever the query raised, turning one error into another.
    select_results = ()
    try:
        select_cursor = select_db.cursor()
        select_sql = "SELECT * FROM ireadlist WHERE downloadcount > 6000"
        # Execute the SQL statement and collect every matching row.
        select_cursor.execute(select_sql)
        select_results = select_cursor.fetchall()
    except pymysql.Error as select_error:
        # Narrowed from a bare `except:` that silently hid every failure
        # (including KeyboardInterrupt); report what actually went wrong.
        print("select error msg:", select_error)
    finally:
        # Close the connection even if an unexpected exception propagates.
        select_db.close()
    return select_results
132+
133+
# draw data matplot
def draw_data_matplot(select_results):
    """Plot the download counts of the given rows as blue square markers.

    Args:
        select_results: iterable of DB rows; column 2 is assumed to hold
            the download count — TODO confirm against the table schema.

    Sorts the counts ascending in place via quick_sort, prints each sorted
    value, then shows the figure (blocks until the window is closed).
    """
    # The original also collected row[0] into an unused `list_name`; dropped.
    list_count = [int(select_list[2]) for select_list in select_results]
    quick_sort(list_count, 0, len(list_count) - 1)  # in-place ascending sort
    for count in list_count:
        print("quick sort: ", count)
    plt.plot(list_count, 'bs')  # 'bs' = blue square markers
    plt.show()
147+
148+
# draw data echart
def draw_data_echart(select_results):
    """Render a pyecharts bar chart of download counts to downloadcount.html.

    Each row in select_results supplies a book name (column 0) and a
    download count (column 2).
    """
    book_names = []
    download_counts = []
    for row in select_results:
        book_names.append(row[0])
        download_counts.append(int(row[2]))
    download_chart = Bar("read weekly", "download count")
    download_chart.use_theme('light')
    download_chart.add("book download count", book_names, download_counts,
                       is_more_utils=True, is_label_show=True, is_datazoom_show=True)
    download_chart.render("downloadcount.html")
160+
161+
def sub_sort(array, low, high):
    """Partition array[low..high] in place around pivot array[low]
    (Hoare hole-filling scheme) and return the pivot's final index.

    After the call, every element left of the returned index is < pivot
    and every element right of it is >= pivot.

    Fix: the original's second inner loop tested ``array[high] < key``
    while writing ``array[high]`` into successive left slots, duplicating
    one value and losing another (e.g. [3, 1, 2] became [2, 2, 3]).
    """
    key = array[low]  # pivot value; array[low] is now a "hole" we may overwrite
    while low < high:
        # Scan from the right for an element smaller than the pivot.
        while low < high and array[high] >= key:
            high -= 1
        # Move it into the hole on the left; the hole is now at `high`.
        array[low] = array[high]
        # Scan from the left for an element that belongs on the right.
        while low < high and array[low] < key:
            low += 1
        # Move it into the hole on the right; the hole is now at `low`.
        array[high] = array[low]
    # low == high: drop the pivot into the last remaining hole.
    array[low] = key
    return low
172+
173+
def quick_sort(array, low, high):
    """In-place quicksort of array[low..high] (inclusive).

    Partitions the range with sub_sort, then recurses on both sides of
    the pivot index it returns.
    """
    if low >= high:  # zero- or one-element range: nothing to sort
        return
    pivot_index = sub_sort(array, low, high)
    quick_sort(array, low, pivot_index)
    quick_sort(array, pivot_index + 1, high)
178+
113179

114180
if __name__ == "__main__":
115181
#url = 'https://www.671cf.com/htm/index.htm'
@@ -135,17 +201,27 @@ def get_whole_page_url(header_url, header):
135201
page_html = get_page_html(url_unit, header, 3, page_encode)
136202
# get html data
137203
collect_data(page_html, queue_num)
138-
'''
204+
205+
results = select_data_from_mysql()
206+
#draw_data_matplot(results)
207+
draw_data_echart(results)
139208

140209
# test mysql update
141-
test_url = 'http://www.ireadweek.com/index.php/index/16.html'
210+
#test_url = 'http://www.ireadweek.com/index.php/index/16.html'
211+
'''test_url = 'http://www.ireadweek.com/index.php/index/168.html'
142212
page_encode = get_page_encode(test_url, header)
143213
144214
page_html = get_page_html(test_url, header, 3, page_encode)
145215
146216
collect_data(page_html, 9)
147217
'''
148218

219+
'''plt.plot([1, 2, 3, 4])
220+
plt.ylabel('some numbers')
221+
plt.show()
222+
'''
223+
224+
149225

150226

151227

0 commit comments

Comments
 (0)