
Commit 76c30c5

Merge pull request #53 from Ericyuanhui/test
Change thread model, add timeout, add exception handling
2 parents 2e69cc3 + 89dda96 commit 76c30c5

File tree

1 file changed: +37 −56 lines changed


Language/python/crawl_web/crawl_web.py

Lines changed: 37 additions & 56 deletions
@@ -10,20 +10,21 @@
 from pyquery import PyQuery as pq
 from threading import Thread
 from time import time
+from goto import with_goto
 
 # Decode the page using the specified charset (not every site sets its charset to utf-8)
 def decode_page(page_bytes, charset='utf-8'):
     page_html = None
     try:
         page_html = page_bytes.decode(charset)
-
     except UnicodeDecodeError:
         pass
         # logging.error('Decode:', error)
     return page_html
 
 # Get the page's HTML (retries the specified number of times, via recursion)
 def get_page_html(seed_url, header_url, retry_times=3, charset='utf-8'):
+    print ("----page html-------")
     page_html = None
     try:
         page_bytes = urllib.request.urlopen(urllib.request.Request(seed_url, headers = header_url)).read()
@@ -36,6 +37,7 @@ def get_page_html(seed_url, header_url, retry_times=3, charset='utf-8'):
 
 # Get the page's encoding
 def get_page_encode(seed_url, header_url):
+    print ("----page encode-------")
     page_encode = None
     try:
         page_bytes = urllib.request.urlopen(urllib.request.Request(seed_url, headers=header_url)).read()
@@ -89,20 +91,17 @@ def multi_thread_collect_data(page_html, start_index):
         final_book_author = re.sub(r'\"+', ' ', re_book_author)
         if book_name:
             query_index = query_index + 1
-            #print("book_name: %s ,book num: %s ,book_author: %s, book link: %s" % (book_name, book_download_num, book_author, href_list))
-
+            print("book_name: %s ,book num: %s ,book_author: %s, book link: %s" % (book_name, book_download_num, book_author, href_list))
             # multi_thread database store all info
             t = Thread(target=store_data, args=(final_book_name, final_book_author, book_download_num, href_list))
             DB_threads.append(t)
             t.start()
-            #store_data(final_book_name, final_book_author, book_download_num, href_list)
         else:
             break
     # wait for all DB operation finish
     for t in DB_threads:
         t.join()
 
-
 # store data into pymysql
 def store_data(g_name, g_author, g_downloadcount, g_link):
     db = pymysql.connect(host = "localhost", user = "root", password = "123456", database= "testdb", charset="utf8")
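
Note: the hunk above fans out one thread per parsed book and joins them all before returning, so every row is written before the crawler moves on. A minimal sketch of that fan-out/join pattern; the `items` list and the `store_data` body here are stand-ins, not the diff's pymysql version:

from threading import Thread

def store_data(name, author, count, link):
    pass  # stand-in for the diff's pymysql writer

items = [("name", "author", 3, "http://example.com")]  # stand-in rows

threads = []
for item in items:
    t = Thread(target=store_data, args=item)  # one writer thread per row
    threads.append(t)
    t.start()
for t in threads:
    t.join()  # block until every write has finished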
@@ -123,26 +122,27 @@ def store_data(g_name, g_author, g_downloadcount, g_link):
     db.commit()
     db.close()
 
-def get_whole_page_url(header_url, header):
-    list_url = []
-    page_number = 1
-    while(1):
-        test_url = header_url + 'index.php/index/' + str(page_number) + '.html'
-        test_header = header
-        try:
-            test_html = urllib.request.urlopen(urllib.request.Request(test_url, headers=test_header)).read()
-            test_query = pq(test_html)
-            test_name = pq(test_query('a').eq(7)).find('.hanghang-list-name').text()
-            if test_name:
-                page_number = page_number + 1
-                list_url.append(test_url)
-                #print ("list name ", test_name)
-                #time.sleep(2)
+@with_goto
+def get_url_request_handle(header_url, header):
+    retry_count = 3
+    first_url = header_url + 'index.php/index/1.html'
+    label .retry
+    try:
+        first_html = urllib.request.urlopen(urllib.request.Request(first_url, headers=header), timeout=0.5).read()
+    except Exception as e:
+        if str(e) == "timed out":
+            if (retry_count > 0):
+                print ("urlopen error timed out retry!")
+                retry_count = retry_count - 1
+                goto .retry
             else:
-                break
-        except URLError:
-            break
-    return list_url
+                raise Exception('urlopen error timed out more than three times!')
+        else:
+            raise Exception('exception error!')
+    html_query = pq(first_html)
+    link_list = html_query('.action-pagination')
+    page_number = int(link_list.find('a').eq(6).attr('href').split('/')[3].split('.')[0])
+    return page_number
 
 # select mysql data to chart
 def select_data_from_mysql():
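
Note: `label .retry` and `goto .retry` in the hunk above come from the goto-statement package, which is why the diff adds `from goto import with_goto` and decorates the function with `@with_goto`. The same timeout-and-retry behaviour can be sketched as a plain loop with no goto dependency; `fetch_with_retry` below is a name introduced for illustration, with the retry count and 0.5 s timeout taken from the diff:

import socket
import urllib.request

def fetch_with_retry(url, headers, retries=3, timeout=0.5):
    # One initial attempt plus up to `retries` retries on read timeouts.
    for _ in range(retries + 1):
        try:
            req = urllib.request.Request(url, headers=headers)
            return urllib.request.urlopen(req, timeout=timeout).read()
        except socket.timeout:
            print("urlopen error timed out retry!")
    raise Exception('urlopen error timed out more than three times!')

(Connect-phase timeouts can also surface wrapped in urllib.error.URLError; the diff, like this sketch, only retries the bare "timed out" case.)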
@@ -165,8 +165,6 @@ def draw_data_matplot(select_results):
     for select_list in select_results:
         list_name.append(select_list[0])
         list_count.append(int(select_list[2]))
-        #print ("select name: %s, select count: %d" % (select_list[0], int(select_list[2])))
-        #plt.plot(list_count, 'bs')
     quick_sort(list_count, 0, len(list_count)-1)
     for i in list_count:
         print ("quick sort: ", i)
@@ -208,7 +206,6 @@ def multi_thread_get_html(url_unit, header, queue_num):
     # get html data
     multi_thread_collect_data(page_html, queue_num)
 
-
 def sub_sort(array,low,high):
     key = array[low]
     while low < high:
@@ -227,36 +224,19 @@ def quick_sort(array,low,high):
         quick_sort(array,low,key_index)
         quick_sort(array,key_index+1,high)
 
-
 if __name__ == "__main__":
-    #url = 'https://www.671cf.com/htm/index.htm'
-    #url = 'https://www.gavbus.com/'
+
     start = time()
     url = 'http://www.ireadweek.com/'
     header = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'}
     ssl._create_default_https_context = ssl._create_unverified_context
 
-    # get all page url list
-    search_list = []
-    search_list = get_whole_page_url(url, header)
+    # get all page url count
+    page_count = get_url_request_handle(url, header)
     cycle_flag = 0
     GH_threads = []
-    '''for url_unit in search_list:
-        print ("---------url-------", url_unit)
-        if cycle_flag:
-            queue_num = 7
-        else:
-            cycle_flag = 1
-            queue_num = 9
-        # get page encode
-        page_encode = get_page_encode(url_unit, header)
-        # get page html
-        page_html = get_page_html(url_unit, header, 3, page_encode)
-        # get html data
-        collect_data(page_html, queue_num)
-    '''
-    for url_unit in search_list:
-        print ("---------url-------", url_unit)
+    for n in range(1, int(page_count)):
+        url_unit = "http://www.ireadweek.com/index.php/index/" + str(n) + ".html"
         if cycle_flag:
             queue_num = 7
         else:
@@ -275,19 +255,20 @@ def quick_sort(array,low,high):
     end = time()
     print('cost time is %.3f s ' %(end - start))
 
-    '''
-    # test mysql update
+    '''# -----test-----
    #test_url = 'http://www.ireadweek.com/index.php/index/16.html'
    #test_url = 'http://pan.baidu.com/s/1qY91y0G'
-    test_url = 'http://www.ireadweek.com/index.php/bookInfo/11043.html'
-
+    test_url = 'http://www.ireadweek.com/index.php/index/1.html'
     page_encode = get_page_encode(test_url, header)
     page_html = get_page_html(test_url, header, 3, page_encode)
     #collect_data(page_html, 9)
     html_query = pq(page_html)
-    link_list = html_query('.hanghang-shu-content-btn')
-    print (link_list.find('a').attr('href'))
-    print (type(link_list.find('a').attr('href')))
+    link_list = html_query('.action-pagination')
+    print (link_list)
+    print (link_list.find('a').eq(6).attr('href'))
+    number = link_list.find('a').eq(6).attr('href')
+    print (number.split('/')[3].split('.')[0])
+    print (type(int(number.split('/')[3].split('.')[0])))
     '''
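Note: the new `get_url_request_handle` replaces the old page-by-page probing with a single request that reads the total page count from the seventh anchor in the `.action-pagination` block. A minimal sketch of that parsing step, using a hypothetical href of the shape the code expects:

# Hypothetical pagination href of the form '/index.php/index/<N>.html':
href = '/index.php/index/718.html'
page_number = int(href.split('/')[3].split('.')[0])
# split('/') -> ['', 'index.php', 'index', '718.html']; '718.html' -> 718
print(page_number)  # 718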