-
Notifications
You must be signed in to change notification settings - Fork 130
/
getMcaGovData.js
189 lines (168 loc) · 8.05 KB
/
getMcaGovData.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
import cheerio from 'cheerio'
import http from 'https'
import path from 'path'
const fs = require('fs');
/**
 * Creates an empty province record.
 *
 * Declared as a regular function rather than an arrow function: this file
 * invokes the factory with `new`, and arrow functions cannot be used as
 * constructors. Because an object is returned explicitly, `Province()` and
 * `new Province()` both yield the same plain object shape.
 *
 * @returns {{code: string, name: string, children: Array}} A fresh record with
 *   empty `code`/`name` and its own empty `children` array.
 */
function Province() {
  return {
    code: '',
    name: '',
    children: [],
  };
}
/**
 * Creates an empty area record (the leaf level: it has no `children`).
 *
 * Declared as a regular function rather than an arrow function: this file
 * invokes the factory with `new`, and arrow functions cannot be used as
 * constructors. Because an object is returned explicitly, `Area()` and
 * `new Area()` both yield the same plain object shape.
 *
 * @returns {{code: string, name: string}} A fresh record with empty fields.
 */
function Area() {
  return {
    code: '',
    name: '',
  };
}
/**
 * Creates an empty city record.
 *
 * Declared as a regular function rather than an arrow function: this file
 * invokes the factory with `new`, and arrow functions cannot be used as
 * constructors. Because an object is returned explicitly, `City()` and
 * `new City()` both yield the same plain object shape.
 *
 * @returns {{code: string, name: string, children: Array}} A fresh record with
 *   empty `code`/`name` and its own empty `children` array.
 */
function City() {
  return {
    code: '',
    name: '',
    children: [],
  };
}
/**
 * Downloads an HTML page of administrative-division codes, turns its table
 * cells into a province -> city -> area tree and writes the tree to
 * `provinceList.json` next to this file.
 */
class GetMcaGovData {
  sourceUrl = '';
  headerClass = '';
  cityClass = '';

  /**
   * @param {string} sourceUrl - URL of the page to download (requested via the `https` module).
   * @param {string} headerClass - CSS class of the cells holding province/city codes and names.
   * @param {string} cityClass - CSS class of the cells holding area codes and names.
   */
  constructor(sourceUrl, headerClass, cityClass) {
    this.sourceUrl = sourceUrl;
    this.headerClass = headerClass;
    this.cityClass = cityClass;
  }

  /**
   * Returns the trimmed, non-empty text of every element carrying `className`,
   * in document order.
   *
   * @param {Function} $ - A loaded cheerio instance.
   * @param {string} className - CSS class to select (without the leading dot).
   * @returns {string[]} Cell texts; expected to alternate code, name, code, name, ...
   */
  static collectCellTexts($, className) {
    return $('.' + className)
      .toArray()
      .map((element) => $(element).text().trim())
      .filter(Boolean);
  }

  /**
   * Builds the province -> city -> area tree from two flat lists of
   * alternating `code, name` texts.
   *
   * @param {string[]} headerTexts - Province and city cells (code, name, code, name, ...).
   * @param {string[]} areaTexts - Area cells (code, name, ...), grouped by province in
   *   the same order as the provinces appear in `headerTexts`.
   * @returns {Array} Province records with nested city and area records.
   */
  static buildRegionTree(headerTexts, areaTexts) {
    const provinces = [];

    // Pass 1: provinces (codes ending in "0000") and the cities below them.
    // The bound is `<`, not `<=`, so no pair past the end of the list is read.
    for (let i = 0; i < headerTexts.length; i += 2) {
      const code = headerTexts[i];
      const name = headerTexts[i + 1] ?? '';
      if (!/\d/.test(code)) {
        continue;
      }
      if (code.endsWith('0000')) {
        // The record factories are called without `new`, which works whether
        // they are arrow functions or regular functions.
        const province = Province();
        province.name = name;
        province.code = code;
        province.children = province.children || [];
        provinces.push(province);
      } else {
        const city = City();
        city.name = name;
        city.code = code;
        city.children = city.children || [];
        // The first two digits identify the owning province.
        const provincePrefix = code.substring(0, 2);
        const owner = provinces.find((item) => item.code.startsWith(provincePrefix));
        if (owner) {
          owner.children.push(city);
        }
      }
    }

    // Pass 2: areas. A single cursor walks the area list once; each province
    // consumes the consecutive run of areas that share its two-digit prefix.
    const municipalities = ['北京市', '重庆市', '上海市', '天津市'];
    let cursor = 0;
    for (const province of provinces) {
      const provincePrefix = province.code.substring(0, 2);
      const cities = province.children;

      while (cursor < areaTexts.length) {
        const code = areaTexts[cursor];
        const name = areaTexts[cursor + 1] ?? '';

        if (!/\d/.test(code)) {
          // A non-numeric cell where a code was expected: skip the pair so the
          // loop always makes progress. The count check in loadData will then
          // report the mismatch instead of the script hanging.
          cursor += 2;
          continue;
        }
        if (!code.startsWith(provincePrefix)) {
          // This area belongs to a later province.
          break;
        }

        const area = Area();
        area.code = code;
        area.name = name;

        // The first four digits (province + city) identify the owning city,
        // whose own code ends in "00".
        const cityPrefix = code.substring(0, 4);
        const owner = cities.find(
          (item) => item.code.startsWith(cityPrefix) && item.code.endsWith('00'),
        );

        if (owner) {
          owner.children.push(area);
        } else if (cities.length === 0) {
          // The province has no city level yet: create a city node that
          // mirrors the province and hang the area under it.
          const mirrorCity = City();
          mirrorCity.name = province.name;
          mirrorCity.code = province.code;
          mirrorCity.children.push(area);
          cities.push(mirrorCity);
        } else if (!municipalities.includes(province.name)) {
          // No matching city in an ordinary province: the area sits directly
          // at city level.
          cities.push(area);
        } else {
          // Municipalities keep all their areas under the single mirror city.
          cities[0].children.push(area);
        }
        cursor += 2;
      }
    }

    return provinces;
  }

  /**
   * Counts every node in the tree: provinces, their children, and the
   * children of those children (when present).
   *
   * @param {Array} provinces - Tree as returned by `buildRegionTree`.
   * @returns {number} Total node count.
   */
  static countRegions(provinces) {
    let count = 0;
    for (const province of provinces) {
      count += 1;
      for (const city of province.children) {
        count += 1;
        if (city.children) {
          count += city.children.length;
        }
      }
    }
    return count;
  }

  /**
   * Downloads `sourceUrl`, parses it and writes `provinceList.json`.
   *
   * Returns immediately; downloading, parsing and writing happen in callbacks.
   * Failures that occur inside those callbacks (request error, count mismatch)
   * are thrown from the callback and therefore cannot be caught by the caller.
   *
   * @throws {Error} Synchronously when the URL or either CSS class is missing,
   *   or when the request cannot be started (original error kept as `cause`).
   */
  loadData = () => {
    if (!this.sourceUrl) {
      throw new Error('not set the url of parser !');
    }
    if (!this.headerClass || !this.cityClass) {
      throw new Error('not set the city or header class of header !');
    }
    try {
      const request = http.get(this.sourceUrl, (res) => {
        res.setEncoding('utf8');
        // Accumulate the body as chunks arrive.
        let html = '';
        res.on('data', (chunk) => {
          html += chunk;
        });
        // The whole body has been received: parse and export.
        res.on('end', () => {
          const $ = cheerio.load(html);
          const areaTexts = GetMcaGovData.collectCellTexts($, this.cityClass);
          const headerTexts = GetMcaGovData.collectCellTexts($, this.headerClass);

          // Every entry occupies two cells (code + name), hence the division by 2.
          console.log('省市总计数量:' + headerTexts.length / 2);
          console.log('区总计数量:' + areaTexts.length / 2);
          const total = (areaTexts.length + headerTexts.length) / 2;
          console.log('省市区总计数量:' + total);

          const listProvince = GetMcaGovData.buildRegionTree(headerTexts, areaTexts);

          // The four municipalities each gain one mirror city node that is not
          // present in the source cells, so subtract them before comparing.
          const parseTotal = GetMcaGovData.countRegions(listProvince) - 4;
          console.log('解析完成总计数量:' + parseTotal, total);
          console.log('解析数量是否相等:' + (parseTotal === total ? '相等' : '不相等'));

          if (parseTotal !== total) {
            throw new Error('解析前后数量不相等,解析失败!');
          }
          fs.writeFile(
            path.join(__dirname, 'provinceList.json'),
            JSON.stringify(listProvince),
            (err) => {
              if (err) {
                // Report the failure instead of dropping it silently.
                console.error('导出失败', err);
                return;
              }
              console.log('导出成功');
            },
          );
        });
      });
      // Network-level failures arrive as an 'error' event, not as an exception
      // from http.get, so the surrounding try/catch cannot see them.
      request.on('error', (err) => {
        throw new Error('parse with error !', { cause: err });
      });
    } catch (e) {
      // Only synchronous failures (e.g. an invalid URL) end up here.
      throw new Error('parse with error !', { cause: e });
    }
  };
}
// Page to scrape and the CSS classes of its table cells. Look up headerClass
// and cityClass by inspecting the CSS classes used on the official source page.
const sourcePage = {
  url: 'https://www.mca.gov.cn/article/sj/xzqh/2020/2020/2020112010001.html',
  headerClass: 'xl7014987',
  cityClass: 'xl7114987',
};
const data = new GetMcaGovData(sourcePage.url, sourcePage.headerClass, sourcePage.cityClass);
data.loadData();