Skip to content

Commit cfc484b

Browse files
author
freezer712
committed
ULC_notice
1 parent a24634c commit cfc484b

File tree

2 files changed

+108
-0
lines changed

2 files changed

+108
-0
lines changed

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,7 @@
11
# python_programs
22
Some recently completed simple python programs
3+
4+
###### 1、ULC_notice.py
5+
6+
用于爬取南大青年上的公告通知,并以excel表格的形式存储
7+

codes/ULC_notice.py

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
import sys
2+
import requests
3+
from bs4 import BeautifulSoup
4+
import xlsxwriter
5+
import time
6+
7+
8+
# 爬取南大青年的公告通知,保存标题,时间,链接3部分内容
9+
# 读入url输出页面内容
10+
def grab_information(url, timeout=10):
    """Fetch *url* and return the page body as UTF-8 text.

    Parameters
    ----------
    url : str
        Address of the listing page to download.
    timeout : float
        Seconds to wait for the server before giving up; guards against a
        hung connection (the original call could block forever).

    Exits the process with a non-zero status when the server does not answer
    with HTTP 200, preserving the script's fail-fast behaviour.
    """
    r = requests.get(url, timeout=timeout)
    if r.status_code == 200:
        # Presumably the server's declared charset is unreliable for this
        # site — force UTF-8 before reading the text. TODO confirm.
        r.encoding = 'utf-8'
        return r.text
    print("无法爬取网页信息")
    # Original used sys.exit(0), which signals success to the shell even
    # though the fetch failed; a failure must exit non-zero.
    sys.exit(1)
18+
19+
20+
# 读入网页源码,输出信息列表
21+
# 将通知标题、时间、链接汇总成列表输出
22+
def parse(html_page, records_per_page):
    """Extract notice rows from one listing page.

    Parameters
    ----------
    html_page : str
        Full HTML of a listing page.
    records_per_page : int
        Number of <li> entries to read from the listing.

    Returns
    -------
    list
        One ``[title, publish_time, link]`` list per notice, in page order.
    """
    # Cut the listing fragment out of the page before handing it to the
    # parser; the two markers bracket the <li> list we care about.
    fragment = html_page.split('<div frag="窗口6" portletmode="simpleList">')[1]
    fragment = fragment.split(' <div id="wp_paging_w6"> ')[0]
    item = BeautifulSoup(fragment, 'html.parser').li
    rows = []
    for _ in range(records_per_page):
        link = item.a.get('href')
        # Site-relative detail pages need the host prepended.
        if "/page.htm" in link:
            link = "https://tuanwei.nju.edu.cn" + link
        # Publish time sits in the element following the first <span>.
        rows.append([item.a.string, item.span.find_next_sibling().string, link])
        item = item.find_next_sibling()
    return rows
36+
37+
38+
# 从网页提取总页面数、总记录数,每页记录数
39+
# 读入网页,输出总页面数、总记录数,每页记录数
40+
def parse_pcm(html):
    """Read paging metadata from a listing page.

    Parameters
    ----------
    html : str
        Full HTML of any listing page (the paging bar appears on each).

    Returns
    -------
    tuple
        ``(total_pages, total_records, records_per_page)`` as ints.
    """
    # Isolate the paging bar before parsing.
    pcm_text = html.split('<ul class="wp_paging clearfix"> ')[1].split('<li class="page_nav">')[0]
    pcm_soup = BeautifulSoup(pcm_text, 'html.parser')
    records_per_page = int(pcm_soup.em.string)
    total_records = int(pcm_soup.span.find_next_sibling().em.string)
    # Ceiling division. The original `total // per + 1` over-counted by one
    # page whenever total_records was an exact multiple of records_per_page
    # (e.g. 20 records at 10/page gave 3 pages instead of 2).
    total_pages = -(-total_records // records_per_page)
    return total_pages, total_records, records_per_page
47+
48+
49+
# 列表信息以xlsx表格的形式存储
50+
# 读入列表,输出xlsx表格总记录条数
51+
# 表格要求有标题
52+
# 1-4列分别是序号,通知标题,发布时间,网页链接
53+
def storage(info_list):
    """Write the collected notices to an .xlsx workbook in the working dir.

    Layout: row 0 = sheet title, row 1 = creation timestamp, row 2 = column
    headers, rows 3+ = one notice per row (1-based index, title, time, url).

    Parameters
    ----------
    info_list : list
        ``[title, publish_time, url]`` triples, one per notice.
    """
    chart_title = '南大青年网站公告通知'
    c_time = "创建时间:" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # `with` guarantees the workbook is closed (and the file flushed) even if
    # a write raises — the original leaked the handle on any exception.
    with xlsxwriter.Workbook(chart_title + " " + time.strftime("%Y%m%d", time.localtime()) + '.xlsx') as workbook:
        sheet = workbook.add_worksheet()
        # Header block.
        sheet.write(0, 0, chart_title)
        sheet.write(1, 0, c_time)
        for col, header in enumerate(('序号', '标题', '时间', '网址')):
            sheet.write(2, col, header)
        # One data row per notice, starting at spreadsheet row 3.
        for row, (title, p_time, href) in enumerate(info_list, start=3):
            sheet.write(row, 0, row - 2)  # 1-based running index
            sheet.write(row, 1, title)
            sheet.write(row, 2, p_time)
            sheet.write(row, 3, href)
73+
74+
75+
def main():
    """Crawl every page of the notice board and store the results as .xlsx.

    Listing URLs are list1.htm, list2.htm, …; the page count and per-page
    record count are read from the paging bar of the first page. The whole
    crawl is restarted from scratch whenever the number of collected rows
    disagrees with the advertised total (e.g. the site changed mid-crawl).
    """
    first_url = 'https://tuanwei.nju.edu.cn/ggtz/list1.htm'
    # 'list1.htm' -> ('…/ggtz/list', '.htm'); page N is prefix + N + suffix.
    url_prefix, url_suffix = first_url.split("1")
    # Iterative retry loop. The original recursed into main() on a count
    # mismatch, which risks RecursionError if the site stays inconsistent;
    # observable behaviour (retry until the totals match) is unchanged.
    while True:
        html = grab_information(first_url)
        # The first page also carries the paging metadata.
        total_pages, total_records, records_per_page = parse_pcm(html)
        print('已爬取基本信息')
        info_list = parse(html, records_per_page)
        print('------------------已解析第1页内容--------------------')
        # Full pages 2 .. total_pages-1.
        for i in range(2, total_pages):
            html = grab_information(url_prefix + str(i) + url_suffix)
            info_list += parse(html, records_per_page)
            print('------------------已解析第' + str(i) + '页内容--------------------')
        # The last page may hold fewer than records_per_page entries, so its
        # record count is derived from the total.
        # NOTE(review): a single-page board (total_pages == 1) would re-fetch
        # page 1 here, as in the original — confirm the site always has >1 page.
        html = grab_information(url_prefix + str(total_pages) + url_suffix)
        info_list += parse(html, total_records - (total_pages - 1) * records_per_page)
        # Sanity check: re-crawl everything if the collected count is off.
        if len(info_list) != total_records:
            print('--------------------总数异常,将重新爬取内容-------------------------')
            continue
        print('------------------总数核对无误,正在将内容存入表格---------------------')
        storage(info_list)
        print('存储完成')
        return
100+
101+
102+
# Run the crawler only when executed as a script, not on import.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)