-
Notifications
You must be signed in to change notification settings - Fork 2
/
spider.py
182 lines (160 loc) · 8.88 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import os
import re
import time
import json
from hashlib import md5
from concurrent import futures
from urllib.parse import urlencode
import pymongo
import requests
from logger import logger
# 连接MongoDB
client = pymongo.MongoClient(host='localhost', port=27017)
db = client.toutiao
collection = db.jiepai
# 设置图片下载后的保存基目录
basedir = os.path.abspath(os.path.dirname(__file__)) # 当前模块文件的根目录
down_path = os.path.join(basedir, 'downloads')
if not os.path.exists(down_path):
os.mkdir(down_path)
logger.info('Create base directory [%s]', down_path)
def get_albums(offset):
'''获取今日头条的街拍图集
:param offset: 获取多少个图集,默认是20个
'''
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
}
params = {
'offset': offset,
'format': 'json',
'keyword': '街拍',
'autoload': 'true',
'count': 20, # 默认加载一页是20个图集
'cur_tab': 3, # '图集'页签。默认是1,是'综合'页签,不全是图集还有文章
'from': 'search_tab'
}
url = 'http://www.toutiao.com/search_content/?' + urlencode(params) # 用参数构造请求的URL
# 捕获request.get方法的异常,比如连接超时、被拒绝等
try:
resp = requests.get(url, headers=headers)
resp.raise_for_status()
except requests.exceptions.HTTPError as errh:
# In the event of the rare invalid HTTP response, Requests will raise an HTTPError exception (e.g. 401 Unauthorized)
logger.error('HTTP Error: %s', errh)
return
except requests.exceptions.ConnectionError as errc:
# In the event of a network problem (e.g. DNS failure, refused connection, etc)
logger.error('Connecting Error: %s', errc)
return
except requests.exceptions.Timeout as errt:
# If a request times out, a Timeout exception is raised. Maybe set up for a retry, or continue in a retry loop
logger.error('Timeout Error: %s', errt)
return
except requests.exceptions.TooManyRedirects as errr:
# If a request exceeds the configured number of maximum redirections, a TooManyRedirects exception is raised. Tell the user their URL was bad and try a different one
logger.error('Redirect Error: %s', errr)
return
except requests.exceptions.RequestException as err:
# catastrophic error. bail.
logger.error('Else Error: %s', err)
return
# 默认获取一页加载的20个图集:标题和URL
albums_list = []
dict_obj = resp.json() # 将响应的JSON数据转换为Python字典
if dict_obj.get('data'): # 默认resp中包含20个图集,存放在data字段下面
for album in dict_obj.get('data'):
# 偶尔有不是图集的需要排除,所以每页不一定是20个
if album.get('article_url') and re.match(r'http://toutiao\.com/group/\d+?/', album.get('article_url')):
album_title = album.get('title') # 每个图集的标题
album_url = album.get('article_url') # 每个图集的URL
album_date = album.get('datetime').split()[0] # 每个图集的发布日期,类似'2018-06-24'
album_author = album.get('media_name', '佚名') # 每个图集的发布者
album_dict = {
'album_title': album_title,
'album_url': album_url,
'album_date': album_date,
'album_author': album_author
}
albums_list.append(album_dict)
return albums_list
def get_images(album):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
}
# TODO: 捕获request.get方法的异常,比如连接超时、被拒绝等
resp = requests.get(album['album_url'], headers=headers)
# 图集下面所有的原始大图URL都放在JavsScript代码片段 gallay: JSON.parse() 下面
images_pattern = re.compile('gallery: JSON.parse\("(.*)"\)', re.S)
m = re.search(images_pattern, resp.text)
if m: # 如果正则匹配到,匹配结果存储在第1个分组中:m.group(1)
dict_obj = json.loads(m.group(1).replace('\\', '')) # 去除匹配结果中的\\,自己用chrome看gallay: JSON.parse()具体是什么
if dict_obj and 'sub_images' in dict_obj.keys():
sub_images = dict_obj.get('sub_images')
images = [item.get('url') for item in sub_images] # 列表生成式,真正的图片URL列表
data = {
'album_title': album['album_title'], # 图集标题
'album_url': album['album_url'], # 图集URL
'album_date': album['album_date'], # 图集发布日期
'album_author': album['album_author'], # 图集发布者
'images': images # 图集下面所有图片URL列表 图片URL类似于http://p1.pstatp.com/origin/pgc-image/15298463258670adeacc647
}
# 保存到MongoDB
if not collection.find_one(data):
if collection.insert_one(data):
logger.debug('Successfully saved gallery {} [{}] to MongoDB'.format(album['album_title'], album['album_url']))
else:
logger.debug('Error saving gallery {} [{}] to MongoDB'.format(album['album_title'], album['album_url']))
else: # 如果该条记录已存在,则不存储到数据库
logger.debug('Gallery {} [{}] has exist in MongoDB'.format(album['album_title'], album['album_url']))
# 按日期创建目录
date_dir = os.path.join(down_path, album['album_date'])
if not os.path.exists(date_dir):
os.mkdir(date_dir)
logger.info('Create date directory [%s]', date_dir)
# 为每个图集创建一个目录
album_dir = os.path.join(date_dir, '[{}P] '.format(len(images)) + re.sub('[\/:*?"<>|]', '_', album['album_title'])) # 注意要去除标题的非法字符
if not os.path.exists(album_dir):
os.mkdir(album_dir)
logger.info('Create gallery directory [%s]', album_dir)
# 如果图集目录下已保存的图片数等于images列表的长度,说明整个图集已下载过,跳过本次下载
if os.path.exists(album_dir) and len([img for img in os.listdir(album_dir) if os.path.isfile(os.path.join(album_dir, img))]) == len(images):
logger.info('All images of gallery {} [{}] has exist, ignore download'.format(album['album_title'], album['album_url']))
return
# 否则,依次下载每一张图片(同步阻塞)
for image in images:
# TODO: 捕获request.get方法的异常,比如连接超时、被拒绝等
resp = requests.get(image, headers=headers)
# 以图片内容的hash值当作文件名,可以保证不重复下载。这样以后重新运行下载脚本时,如果发现本地已经保存了同一图片就不下载
image_name = md5(resp.content).hexdigest() + '.jpg' # 因为图片没有后缀名,所以简单指定为.jpg。 如果有后缀,要用os.path.splitext切割获取后缀名
image_path = os.path.join(album_dir, image_name) # 图片的保存路径
if not os.path.exists(image_path):
with open(image_path, 'wb') as f:
f.write(resp.content)
else:
logger.info('Image [{}] of gallery {} [{}] has exist, ignore download'.format(image, album['album_title'], album['album_url']))
else:
logger.debug('Can not find images in gallery {} [{}]'.format(album['album_title'], album['album_url']))
def download_many():
'''多线程下载,每个线程下载一个图集'''
pages = 3
album_count = 0 # 统计一共下载了多少个图集,因为有的页会有不合规的会排除,并不是每页20个图集
for offset in [x * 20 for x in range(pages)]: # 排除一些不合规的图集URL,并不是每页20个图集
albums = get_albums(offset) # 返回包含1页中的所有图集列表
album_count += len(albums)
workers = 10
with futures.ThreadPoolExecutor(workers) as executor:
executor.map(get_images, albums)
return album_count
if __name__ == '__main__':
"""
t0 = time.time()
count = download_many()
msg = '{} albums downloaded in {} seconds.'
logger.info(msg.format(count, time.time() - t0))
"""
for offset in [x * 20 for x in range(3)]: # 排除一些不合规的图集URL,并不是每页20个图集
albums = get_albums(offset)
for album in albums:
print(album)
get_images(album)