# coding: utf-8
import gzip

import pymongo
from lxml import etree
from urllib import request


class NewspaperSpider:
    """Crawler for weapon.huanqiu.com that builds a military knowledge base in MongoDB."""

    def __init__(self):
        # Map the English category slugs used in URLs to their Chinese labels.
        self.term_dict = {
            'aircraft': "飞行器",
            'warship': "舰船舰艇",
            'guns': "枪械与单兵",
            'tank': "坦克装甲车辆",
            'artillery': "火炮",
            'missile': "导弹武器",
            'spaceship': "太空装备",
            'explosive': "爆炸物",
        }
        self.conn = pymongo.MongoClient()
    def get_html(self, url):
        """Fetch a page; the site serves gzip-compressed HTML."""
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'Hm_lvt_1fc983b4c305d209e7e05d96e713939f=1552034977; Hm_lpvt_1fc983b4c305d209e7e05d96e713939f=1552036141',
            'Host': 'weapon.huanqiu.com'
        }
        req = request.Request(url, headers=headers)
        resp = request.urlopen(req)
        page = resp.read()
        # Only decompress when the server actually gzip-encoded the body.
        if resp.headers.get('Content-Encoding') == 'gzip':
            page = gzip.decompress(page)
        return page.decode('utf-8')
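
    # Usage sketch (assumes the site is still online and the hard-coded
    # cookie above is still accepted; the category URL is illustrative):
    #   spider = NewspaperSpider()
    #   html = spider.get_html('http://weapon.huanqiu.com/weaponlist/aircraft')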
    def get_urllist(self, url):
        """Collect the detail-page URLs from one list page."""
        html = self.get_html(url)
        selector = etree.HTML(html)
        papers = ['http://weapon.huanqiu.com' + i
                  for i in selector.xpath('//li/span[@class="pic"]/a/@href')]
        return list(set(papers))
    def html_parser(self, url):
        """Parse one detail page into [raw_html, title, attr1, attr2, ...]."""
        html = self.get_html(url)
        selector = etree.HTML(html)
        title = selector.xpath('//title/text()')[0]
        attrs = selector.xpath('//div[@class="dataInfo"]/ul/li')
        contents = [html, title]
        for article in attrs:
            # string(.) flattens each <li> into its full text content.
            content = article.xpath('string(.)')
            contents.append(content)
        return contents
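
    # Sketch of the returned layout (the URL is a placeholder, not a real
    # article id); extract_data below relies on these positions:
    #   contents = spider.html_parser('http://weapon.huanqiu.com/<article>')
    #   raw_html, title, *attr_lines = contents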
    def modify_data(self):
        """Re-parse records already stored in 'military.kb' and write a
        cleaned copy into 'military.graph_data'."""
        for item in self.conn['military']['kb'].find():
            body = item['contents']
            # Normalise full-width punctuation and strip spaces from the title.
            title = body[1].replace(' ', '').replace('－', '-').replace('（', '(').replace('）', ')')
            title = title.split('_')
            data = {}
            data['名称'] = title[0]
            data['类别'] = title[1]
            attrs = body[2:]
            html = body[0]
            selector = etree.HTML(html)
            country = selector.xpath('//span[@class="country"]/b/a/text()')[0]
            data['产国'] = country
            for attr in attrs:
                if len(attr.split(':')) < 2:
                    continue
                key = attr.split(':')[0].replace('（', '(').replace(' ', '').replace('\t', '')
                # Skip parenthesised notes and over-long keys.
                if key.startswith('(') or len(key) > 6:
                    continue
                value = attr.split(':')[1]
                data[key] = value.replace('\t', '').replace('\n', '').replace(',', '')
            self.conn['military']['graph_data'].insert_one(data)
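
    # Offline re-processing sketch, assuming a 'military.kb' collection was
    # populated by an earlier crawl:
    #   NewspaperSpider().modify_data()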
    def spider_main(self):
        """Main crawl entry: walk every top-level category, then every
        second-level category and its pages, and store each detail page."""
        big_cates = ['aircraft', 'warship',
                     'guns', 'tank',
                     'artillery', 'missile',
                     'spaceship', 'explosive']
        for big_cate in big_cates:
            big_url = 'http://weapon.huanqiu.com/weaponlist/%s' % big_cate
            html = self.get_html(big_url)
            selector = etree.HTML(html)
            span = selector.xpath('//span[@class="list"]')[0]
            second_urls = ['http://weapon.huanqiu.com' + i for i in span.xpath('./a/@href')]
            second_cates = span.xpath('./a/text()')
            # Pair each second-level category name with its list URL.
            second_dict = dict(zip(second_cates, second_urls))
            for second_cate, second_url in second_dict.items():
                max_pages = self.get_maxpage(second_url)
                for page in range(1, max_pages + 1):
                    # List pages are paginated as '<base>_0_0_<page>'.
                    url = second_url + '_0_0_%s' % page
                    seed_urls = self.get_urllist(url)
                    for seed in seed_urls:
                        self.get_info(seed, big_cate, second_cate)
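
    # Hypothetical example of the paginated URLs this builds (the actual
    # second-level paths come from the anchors on each category page):
    #   http://weapon.huanqiu.com/weaponlist/aircraft_0_0_1
    #   http://weapon.huanqiu.com/weaponlist/aircraft_0_0_2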
    def get_info(self, url, big_cate, second_cate):
        """Fetch one detail page, tag it with its categories, and store it."""
        content = self.html_parser(url)
        data = self.extract_data(content)
        data['大类'] = self.term_dict.get(big_cate)
        data['类型'] = second_cate
        if data:
            print(data)
            self.conn['military']['knowledge_base'].insert_one(data)
    def extract_data(self, content):
        """Turn the [html, title, attrs...] list from html_parser into a
        structured record."""
        title = content[1].replace(' ', '').replace('－', '-').replace('（', '(').replace('）', ')')
        title = title.split('_')
        data = {}
        data['名称'] = title[0]
        attrs = content[2:]
        html = content[0]
        selector = etree.HTML(html)
        country = selector.xpath('//span[@class="country"]/b/a/text()')[0]
        image = selector.xpath('//div[@class="maxPic"]/img/@src')
        image = image[0] if image else ''
        data['产国'] = country
        data['图片'] = image
        data['简介'] = ''.join(selector.xpath('//div[@class="module"]/p/text()')).replace('\xa0', '').replace('\u3000', '').replace('\t', '')
        for attr in attrs:
            if len(attr.split(':')) < 2:
                continue
            key = attr.split(':')[0].replace('（', '(').replace(' ', '').replace('\t', '')
            # Skip parenthesised notes and over-long keys.
            if key.startswith('(') or len(key) > 6:
                continue
            value = attr.split(':')[1]
            data[key] = value.replace('\t', '').replace('\n', '').replace(',', '')
        return data
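
    # A stored record roughly looks like this once get_info adds the
    # category tags (keys beyond the fixed ones depend on the page; all
    # values here are illustrative):
    #   {'名称': '...', '产国': '...', '图片': 'http://...', '简介': '...',
    #    '大类': '飞行器', '类型': '...'}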
    def get_maxpage(self, url):
        """Read the pagination bar to find the page count; a missing bar
        means a single page."""
        html = self.get_html(url)
        selector = etree.HTML(html)
        max_pages = selector.xpath('//div[@class="pages"]/a/text()')
        if not max_pages:
            max_page = 1
        else:
            # The last anchor appears to be a 'next' link, so the one
            # before it holds the highest page number.
            max_page = int(max_pages[-2])
        return max_page

if __name__ == '__main__':
    handler = NewspaperSpider()
    handler.spider_main()
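
# Query sketch for the collected records, assuming the crawl above has
# already populated 'military.knowledge_base':
#   conn = pymongo.MongoClient()
#   for doc in conn['military']['knowledge_base'].find().limit(3):
#       print(doc.get('名称'), doc.get('产国'))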