crawler_city_scale.py
"""Scrape population, area, and density figures for county-level cities
from citypopulation.de and save them to a CSV file."""

import re

import pandas as pd
import requests
from bs4 import BeautifulSoup


def read_counties(file_path):
    """Read the list of county-level city names from a CSV file."""
    # The input CSV must contain a "城市名字" (city name) column.
    return pd.read_csv(file_path)["城市名字"].tolist()
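

# A minimal sketch of the assumed counties.csv layout; the file itself is
# not shown here, so the example rows below are hypothetical. Only the
# "城市名字" column header is required by read_counties:
#
#   城市名字
#   巢湖市
#   桐城市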


def fetch_province_page(province, base_url):
    """Fetch the admin-division page for a province."""
    full_url = f"{base_url}{province}/admin"
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(full_url, timeout=10)
        response.raise_for_status()
        return response.content
    except requests.RequestException as e:
        print(f"Request to {full_url} failed: {e}")
        return None
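

# For example, with base_url = "https://www.citypopulation.de/zh/china/",
# fetch_province_page("anhui", base_url) requests
# https://www.citypopulation.de/zh/china/anhui/admin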


def extract_county_data(soup, counties_need, matched_cities):
    """Extract county-level city rows from a province's admin table."""
    data = []
    table = soup.find("table", {"class": "data"})
    if not table:
        return data
    for row in table.find_all("tr"):
        links = row.find_all("a", href=True)
        for county in counties_need:
            if county in matched_cities:
                continue
            for link in links:
                # Accept links that mention the county and whose markup
                # points at a china .../admin detail page.
                if (
                    county in str(link)
                    and "admin" in str(link)
                    and "china" in str(link)
                ):
                    print(f"Matched {county}")
                    # The href is site-relative, so prepend the domain.
                    final_url = f"https://www.citypopulation.de{link.get('href')}"
                    print(f"Final URL: {final_url}")
                    county_data = fetch_county_data(final_url, county)
                    if county_data:
                        data.append(county_data)
                        matched_cities.add(county)
                        print(f"Added {county} to matched_cities")
                    break
    return data
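

# A hypothetical example of a link this filter would accept; the real markup
# on citypopulation.de may differ, but per the checks above "china" and
# "admin" must appear in the tag and the county name somewhere in it:
#
#   <a href="/zh/china/anhui/admin/340181__chaohu/">巢湖市</a>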


def fetch_county_data(url, county):
    """Fetch a county-level city's detail page and parse its statistics."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Request to {url} failed: {e}")
        return None
    soup = BeautifulSoup(response.content, "html.parser")
    try:
        # The population chart data is embedded in an inline script as
        # "var addChartData = ...;"; pull that JavaScript literal out.
        soup_str = str(
            re.search(
                r"var addChartData = (.*?);</script>", str(soup), re.DOTALL
            ).group(1)
        )
        popularity_1 = re.search(r'"城镇人口",(\d+)', soup_str).group(1)  # urban
        popularity_2 = re.search(r'"乡村人口",(\d+)', soup_str).group(1)  # rural
        popularity = int(popularity_1) + int(popularity_2)
        print(f"Total population of {county}: {popularity}")
    except AttributeError:
        # A re.search returned None: the page has no usable chart data.
        print(f"No population data found for {county}; skipping")
        return None
    td_with_data_area = soup.find("td", {"class": "rname"})
    if td_with_data_area:
        data_area = td_with_data_area.get("data-area")
        data_density = td_with_data_area.get("data-density")
    else:
        data_area = None
        data_density = None
    print(f"Area of {county}: {data_area}")
    print(f"Population density of {county}: {data_density}")
    return [county, popularity, data_area, data_density]
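

# A rough sketch of the inline script the regexes above expect; the actual
# structure on the site may differ, and only the quoted labels plus the
# digits after each comma matter here:
#
#   <script>var addChartData = [["城镇人口",123456],["乡村人口",234567]];</script>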


def save_data_to_csv(data, file_path):
    """Save the collected rows to a CSV file."""
    # Columns: county-level city, total population, area, population density.
    df = pd.DataFrame(data, columns=["县级市", "总人口", "面积", "人口密度"])
    df.to_csv(file_path, index=False)
    print(f"Data saved to {file_path}")


def main():
    county_csv = "counties.csv"
    output_csv = "county_data_city_scale.csv"
    base_url = "https://www.citypopulation.de/zh/china/"
    # Province slugs as they appear in citypopulation.de URL paths.
    provinces = [
        "anhui",
        "beijing",
        "chongqing",
        "fujian",
        "gansu",
        "guangdong",
        "guangxi",
        "guizhou",
        "hainan",
        "hebei",
        "heilongjiang",
        "henan",
        "hubei",
        "hunan",
        "jiangsu",
        "jiangxi",
        "jilin",
        "liaoning",
        "neimenggu",
        "ningxia",
        "qinghai",
        "shandong",
        "shanghai",
        "shanxi",
        "sichuan",
        "tianjin",
        "xinjiang",
        "xizang",
        "yunnan",
        "zhejiang",
    ]
    counties_need = read_counties(county_csv)
    matched_cities = set()
    all_data = []
    for province in provinces:
        page_content = fetch_province_page(province, base_url)
        if page_content:
            soup = BeautifulSoup(page_content, "html.parser")
            province_data = extract_county_data(soup, counties_need, matched_cities)
            all_data.extend(province_data)
    save_data_to_csv(all_data, output_csv)


if __name__ == "__main__":
    main()