Skip to content

Commit f1c21d9

Browse files
committed
finish all
Signed-off-by: ericyuanhui <285521263@qq.com>
1 parent 3aee196 commit f1c21d9

File tree

1 file changed

+24
-15
lines changed

1 file changed

+24
-15
lines changed

Language/python/crawl_web/crawl_web.py

Lines changed: 24 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,13 @@ def decode_page(page_bytes, charset='utf-8'):
2828
# 获取页面的HTML代码(通过递归实现指定次数的重试操作)
2929
@with_goto
3030
def get_page_html(seed_url, header_url, retry_times=3, charset='utf-8'):
31-
print ("----page html-------")
31+
#print ("----page html-------")
3232
page_html = None
3333
label .get_page_retry
3434
try:
3535
page_bytes = urllib.request.urlopen(urllib.request.Request(seed_url, headers = header_url), timeout=10).read()
3636
page_html = decode_page(page_bytes, charset)
3737
except Exception as e:
38-
print ("-------exception-----: " ,str(e))
3938
if ((str(e) == "timed out") or (str(e) == "<urlopen error timed out>")):
4039
if (retry_times > 0):
4140
print ("urlopen error timed out retry!")
@@ -220,10 +219,10 @@ def multi_thread_get_html(url_unit, header, queue_num):
220219
print ("get html timed out! append the url list!")
221220
url_list.append(url_unit)
222221
else:
223-
print("-----------finish get html ----------")
224-
return 0
222+
#print("-----------finish get html ----------")
223+
#return 0
225224
# get html data
226-
#multi_thread_collect_data(page_html, queue_num)
225+
multi_thread_collect_data(page_html, queue_num)
227226

228227
def sub_sort(array,low,high):
229228
key = array[low]
@@ -266,15 +265,29 @@ def quick_sort(array,low,high):
266265
t.start()
267266
for t in GH_threads:
268267
t.join()
268+
269+
# check if url list is empty
269270
print ("url check list is : ", url_list)
270-
'''
271-
results = select_data_from_mysql()
272-
#draw_data_matplot(results)
273-
get_final_url(results)
274-
draw_data_echart(results)
271+
retry_list = url_list
272+
RE_threads = []
273+
if retry_list:
274+
for retry_n in retry_list:
275+
url_list.remove(retry_n)
276+
t = Thread(target=multi_thread_get_html, args=(retry_n, header, 9))
277+
RE_threads.append(t)
278+
t.start()
279+
for t in RE_threads:
280+
t.join()
281+
282+
else:
283+
results = select_data_from_mysql()
284+
#draw_data_matplot(results)
285+
get_final_url(results)
286+
draw_data_echart(results)
287+
print ("url retry check list is : ", url_list)
275288
end = time()
276289
print('cost time is %.3f s ' %(end - start))
277-
'''
290+
278291
'''# -----test-----
279292
#test_url = 'http://www.ireadweek.com/index.php/index/16.html'
280293
#test_url = 'http://pan.baidu.com/s/1qY91y0G'
@@ -290,7 +303,3 @@ def quick_sort(array,low,high):
290303
print (number.split('/')[3].split('.')[0])
291304
print (type(int(number.split('/')[3].split('.')[0])))
292305
'''
293-
294-
295-
296-

0 commit comments

Comments (0)