AQIstudySpiderWithSelenium/AQIstudySpiderWithSelenium.py at master · lcl1995225/AQIstudySpiderWithSelenium · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# -*- coding: utf-8 -*-
# @Time    : 2018-08-07 13:58
# @Author  : Lcl
# @Mail    : 576884674@qq.com

import pandas as pd
import time
import os
from selenium import webdriver

## 创建存放爬取的临时文件的文件夹
if not os.path.exists('result'):
    os.mkdir('result')

## 组装日期, return list (201312-201808)
year = 2013
yearMonthList = []
while year <= 2018:
    month = 1
    while month <= 12:
        monthStr = '0' + str(month) if month <= 9 else str(month)
        yearStr = str(year)
        yearMonthList.append(yearStr + monthStr)
        month += 1
    year += 1

# 删除多余月份
i = 0
while i <= 10:
    yearMonthList.pop(0)
    if i <= 3:
        yearMonthList.pop(-1)
    i += 1

df = pd.read_excel('city.xlsx', header=0)
# city这台计算机需要处理的城市，因为使用selenium爬取速度很慢，所以分到4台机器上运行。
# 若要几台电脑同时运行：city = df[df['machine'] == 'pc1']['city']，下次考虑直接用远程数据库写分布式
city = df['city']

## 组装url ,return urlList
cityUrlDict = {}

for eachCity in city:
    cityUrlDict[eachCity] = {}
    for eachDate in yearMonthList:
        cityUrlDict[eachCity][eachDate] = 'https://www.aqistudy.cn/historydata/daydata.php?city={}&month={}'.format(eachCity, eachDate)

df = pd.DataFrame(cityUrlDict)

## 使用selenium模拟浏览器行为
# 加启动配置，令chrome后台运行
option = webdriver.ChromeOptions()
option.add_argument('headless')
browser = webdriver.Chrome('chromedriver.exe',chrome_options=option)

# 打开任意首页并停留，之所以有多余的首页，是为了关闭标签时不完全退出浏览器。
browser.get('https://www.baidu.com/')
# 获得主页的句柄，便于后面切换。
homePage = browser.current_window_handle

# 这里如果太快了容易崩溃，可以用一个字段去标记、分发。
for i,eachCity in enumerate(city):
    # 打开新页面
    eachCityUrl = df.ix[:, eachCity]
    for j,url in enumerate(eachCityUrl):
        browser.switch_to.window(homePage)
        # 在界面中运行js，在新标签页中打开界面
        newUrl = 'window.open("{}")'.format(url)
        browser.execute_script(newUrl)
        browser.switch_to.window(browser.window_handles[-1])
        title = browser.title
        # 若发生错误，刷新后再重试一次，实际上不是个好方法。
        try:
            time.sleep(2)
            # 这个js是通过分析前端源码得出的，在console中调试过console.log(items)，实际上不需要这么复杂。
            result = browser.execute_script("return items")
            browser.close()
        except:
            browser.refresh()
            time.sleep(5)
            result = browser.execute_script("return items")
            browser.close()
        # 如果获得的items不为空
        if len(result) > 0:
            for each in result:
                each['city'] = eachCity
            df2 = pd.DataFrame(result)
            df2.to_csv('result//' + each['city'] + each['time_point'][0:7] + '.csv', index_label='index')
        else:
            l = ['empty_flag']
            df2 = pd.DataFrame(l)
            df2.to_csv('result//' + eachCity +  url[-6:-2]+'-'+url[-2:] +'_empty' + '.csv',
                      index_label='index')

        print("该城市进度{}/{}，城市序号{}/{}，当前页面标题：{}".format(j+1,len(eachCityUrl),i + 1, len(city),title))
browser.quit()
print('done，开始合并文件')

files = os.listdir('result')
l_clean = []
for each in files:
    if not 'empty' in each:
        l_clean.append(each)

for i,each in enumerate(l_clean):
    maxNum = len(l_clean)
    if i == 0:
        path = 'result/'+each
        f = open(path,'r',encoding='utf-8')
        df = pd.read_csv(f,index_col='index')
    else:
        path = 'result/'+each
        print(i+1, '/', maxNum,'----',each)
        f = open(path,'r',encoding='utf-8')
        df2 = pd.read_csv(f,index_col='index')
        df = df.append(df2)
df2 = df.sort_values(by=["city","time_point"], ascending=[True, True]).reset_index(drop=True)
df2.to_csv('_2013-12_2018_08.csv', index_label='index')
print('合并文件完成')