Skip to content

Commit 2e69cc3

Browse files
authored
Merge pull request #52 from Ericyuanhui/test
add multi thread
2 parents 5bebcb9 + 145f84a commit 2e69cc3

File tree

1 file changed

+59
-2
lines changed

1 file changed

+59
-2
lines changed

Language/python/crawl_web/crawl_web.py

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@
55
import pymysql
66
import re
77
import matplotlib.pyplot as plt
8-
import time
98

109
from pyecharts import Bar
1110
from pyquery import PyQuery as pq
11+
from threading import Thread
12+
from time import time
1213

1314
# 通过指定的字符集对页面进行解码(不是每个网站都将字符集设置为utf-8)
1415
def decode_page(page_bytes, charset='utf-8'):
@@ -72,6 +73,36 @@ def collect_data(page_html, start_index):
7273
else:
7374
break
7475

76+
# multi-thread analyze data with pyquery
def multi_thread_collect_data(page_html, start_index):
    """Extract book entries from *page_html* and store each one in MySQL
    on its own worker thread.

    page_html   -- decoded HTML of one listing page
    start_index -- index of the first <a> tag that holds a book entry
                   (presumably 7 or 9 depending on page layout -- set by caller)
    """
    html_query = pq(page_html)
    query_index = start_index
    DB_threads = []
    while True:
        # Hoist the anchor lookup: the original re-ran
        # html_query('a').eq(query_index) five times per iteration.
        anchor = html_query('a').eq(query_index)
        book_name = pq(anchor).find('.hanghang-list-name').text()
        if not book_name:
            # No name at this index -> past the last book entry; stop
            # before doing the remaining (now pointless) extraction work.
            break
        href_list = anchor.attr('href')
        book_download_num = pq(anchor).find('.hanghang-list-num').text()
        book_author = pq(anchor).find('.hanghang-list-zuozhe').text()
        # Replace runs of single then double quotes with spaces so the
        # values are safe for the SQL statement built in store_data.
        final_book_name = re.sub(r'\"+', ' ', re.sub(r'\'+', ' ', book_name))
        final_book_author = re.sub(r'\"+', ' ', re.sub(r'\'+', ' ', book_author))
        query_index = query_index + 1
        # One worker thread per row; each store_data call opens its own
        # DB connection, so threads do not share a connection object.
        # NOTE(review): thread count is unbounded (one per book) --
        # consider a small worker pool if listings grow large.
        t = Thread(target=store_data,
                   args=(final_book_name, final_book_author,
                         book_download_num, href_list))
        DB_threads.append(t)
        t.start()
    # wait for all DB operations to finish before returning
    for t in DB_threads:
        t.join()
75106
# store data into pymysql
76107
def store_data(g_name, g_author, g_downloadcount, g_link):
77108
db = pymysql.connect(host = "localhost", user = "root", password = "123456", database= "testdb", charset="utf8")
@@ -169,6 +200,15 @@ def get_final_url(select_results):
169200
file_handle.write('\n')
170201
file_handle.close()
171202

203+
def multi_thread_get_html(url_unit, header, queue_num):
    """Worker body for one listing page: detect the page encoding,
    download the HTML (3 retries), then hand it to the threaded collector."""
    encoding = get_page_encode(url_unit, header)
    html_text = get_page_html(url_unit, header, 3, encoding)
    multi_thread_collect_data(html_text, queue_num)
172212
def sub_sort(array,low,high):
173213
key = array[low]
174214
while low < high:
@@ -191,6 +231,7 @@ def quick_sort(array,low,high):
191231
if __name__ == "__main__":
192232
#url = 'https://www.671cf.com/htm/index.htm'
193233
#url = 'https://www.gavbus.com/'
234+
start = time()
194235
url = 'http://www.ireadweek.com/'
195236
header = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'}
196237
ssl._create_default_https_context = ssl._create_unverified_context
@@ -199,7 +240,8 @@ def quick_sort(array,low,high):
199240
search_list = []
200241
search_list = get_whole_page_url(url, header)
201242
cycle_flag = 0
202-
for url_unit in search_list:
243+
GH_threads = []
244+
'''for url_unit in search_list:
203245
print ("---------url-------", url_unit)
204246
if cycle_flag:
205247
queue_num = 7
@@ -212,11 +254,26 @@ def quick_sort(array,low,high):
212254
page_html = get_page_html(url_unit, header, 3, page_encode)
213255
# get html data
214256
collect_data(page_html, queue_num)
257+
'''
258+
for url_unit in search_list:
259+
print ("---------url-------", url_unit)
260+
if cycle_flag:
261+
queue_num = 7
262+
else:
263+
cycle_flag = 1
264+
queue_num = 9
265+
t = Thread(target=multi_thread_get_html, args=(url_unit, header, queue_num))
266+
GH_threads.append(t)
267+
t.start()
268+
for t in GH_threads:
269+
t.join()
215270

216271
results = select_data_from_mysql()
217272
#draw_data_matplot(results)
218273
get_final_url(results)
219274
draw_data_echart(results)
275+
end = time()
276+
print('cost time is %.3f s ' %(end - start))
220277

221278
'''
222279
# test mysql update

0 commit comments

Comments
 (0)