|
| 1 | +import sys |
| 2 | +import requests |
| 3 | +from bs4 import BeautifulSoup |
| 4 | +import xlsxwriter |
| 5 | +import time |
| 6 | + |
| 7 | + |
| 8 | +# 爬取南大青年的公告通知,保存标题,时间,链接3部分内容 |
| 9 | +# 读入url输出页面内容 |
# Fetch a page and return its HTML text (crawler fail-fast helper).
def grab_information(url, timeout=10):
    """Download *url* and return its body decoded as UTF-8.

    Parameters
    ----------
    url : str
        Page address to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. New parameter with
        a default, so existing callers are unaffected.

    Returns
    -------
    str
        The response body on HTTP 200; otherwise the process exits.
    """
    try:
        r = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network/timeout errors previously propagated as raw tracebacks.
        print("无法爬取网页信息")
        sys.exit(1)
    if r.status_code != 200:
        print("无法爬取网页信息")
        # Original called sys.exit(0), reporting success on failure;
        # a nonzero status correctly signals the error to the shell.
        sys.exit(1)
    r.encoding = 'utf-8'
    return r.text
| 18 | + |
| 19 | + |
| 20 | +# 读入网页源码,输出信息列表 |
| 21 | +# 将通知标题、时间、链接汇总成列表输出 |
def parse(html_page, records_per_page):
    """Extract [title, publish-time, link] triples from one listing page.

    Walks the first *records_per_page* <li> entries of the notice list and
    returns them as a list of three-element lists.
    """
    # Isolate the list markup between the content <div> and the paging <div>.
    start_marker = '<div frag="窗口6" portletmode="simpleList">'
    end_marker = ' <div id="wp_paging_w6"> '
    fragment = html_page.split(start_marker)[1].split(end_marker)[0]
    doc = BeautifulSoup(fragment, 'html.parser')

    entries = []
    node = doc.li  # first <li> of the notice list
    for _ in range(records_per_page):
        link = node.a.get('href')
        # Site-relative detail pages need the host prepended.
        if "/page.htm" in link:
            link = "https://tuanwei.nju.edu.cn" + link
        published = node.span.find_next_sibling().string
        entries.append([node.a.string, published, link])
        node = node.find_next_sibling()
    return entries
| 36 | + |
| 37 | + |
| 38 | +# 从网页提取总页面数、总记录数,每页记录数 |
| 39 | +# 读入网页,输出总页面数、总记录数,每页记录数 |
def parse_pcm(html):
    """Read paging metadata out of a listing page.

    Parameters
    ----------
    html : str
        Full page source containing the ``wp_paging`` widget.

    Returns
    -------
    tuple[int, int, int]
        (total_pages, total_records, records_per_page).
    """
    pcm_text = html.split('<ul class="wp_paging clearfix"> ')[1].split('<li class="page_nav">')[0]
    pcm_soup = BeautifulSoup(pcm_text, 'html.parser')
    records_per_page = int(pcm_soup.em.string)
    total_records = int(pcm_soup.span.find_next_sibling().em.string)
    # Ceiling division. The original `total // per_page + 1` reported one
    # extra (empty) page whenever total_records was an exact multiple of
    # records_per_page, which made the caller fetch a nonexistent page.
    total_pages = -(-total_records // records_per_page)
    return total_pages, total_records, records_per_page
| 47 | + |
| 48 | + |
| 49 | +# 列表信息以xlsx表格的形式存储 |
| 50 | +# 读入列表,输出xlsx表格总记录条数 |
| 51 | +# 表格要求有标题 |
| 52 | +# 1-4列分别是序号,通知标题,发布时间,网页链接 |
# 列表信息以xlsx表格的形式存储
def storage(info_list):
    """Persist scraped notices to an .xlsx file in the working directory.

    Layout: row 0 = workbook title, row 1 = creation timestamp,
    row 2 = column headers, rows 3+ = one record per notice
    (columns: 序号 / 标题 / 时间 / 网址).

    Parameters
    ----------
    info_list : list[list[str]]
        Each element is [title, publish_time, href], as produced by parse().
    """
    chart_title = '南大青年网站公告通知'
    c_time = "创建时间:" + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # File name carries the date so repeated runs don't collide day-to-day.
    workbook = xlsxwriter.Workbook(chart_title + " " + time.strftime("%Y%m%d", time.localtime()) + '.xlsx')
    sheet = workbook.add_worksheet()
    sheet.write(0, 0, chart_title)
    sheet.write(1, 0, c_time)
    # Header row.
    for col, header in enumerate(('序号', '标题', '时间', '网址')):
        sheet.write(2, col, header)
    # Data rows: enumerate + unpacking replaces the range(len(...)) index loop.
    for row, (title, p_time, href) in enumerate(info_list, start=3):
        sheet.write(row, 0, row - 2)  # 1-based running number
        sheet.write(row, 1, title)
        sheet.write(row, 2, p_time)
        sheet.write(row, 3, href)
    workbook.close()
| 73 | + |
| 74 | + |
def main():
    """Crawl every page of the notice list, verify the count, then save.

    Fetches page 1 for paging metadata, walks the remaining pages, and
    retries the whole crawl when the collected record count disagrees with
    the total the site reports.
    """
    first_url = 'https://tuanwei.nju.edu.cn/ggtz/list1.htm'
    # URL pattern is ".../list<N>.htm"; split around the page number once.
    # rsplit guards against any other '1' ever appearing in the address.
    url_prefix, url_suffix = first_url.rsplit("1", 1)
    while True:
        html = grab_information(first_url)
        # 先从第一个页面爬取总记录数,总页数
        total_pages, total_records, records_per_page = parse_pcm(html)
        print('已爬取基本信息')
        info_list = parse(html, records_per_page)
        print('------------------已解析第1页内容--------------------')
        # Middle pages (2 .. total_pages-1) are always full.
        for page in range(2, total_pages):
            html = grab_information(url_prefix + str(page) + url_suffix)
            info_list += parse(html, records_per_page)
            print('------------------已解析第' + str(page) + '页内容--------------------')
        # The last page may hold fewer than records_per_page entries.
        html = grab_information(url_prefix + str(total_pages) + url_suffix)
        info_list += parse(html, total_records - (total_pages - 1) * records_per_page)
        # 判断info_list中的条目数是否与总记录数相等,有问题就重新爬取
        if len(info_list) == total_records:
            break
        # Loop-based retry replaces the original recursive main() call,
        # which grew the call stack on every failed attempt until a
        # RecursionError.
        print('--------------------总数异常,将重新爬取内容-------------------------')
    print('------------------总数核对无误,正在将内容存入表格---------------------')
    storage(info_list)
    print('存储完成')
| 100 | + |
| 101 | + |
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
0 commit comments