Skip to content

Commit 74e8d69

Browse files
authored
Update spider.py
1 parent 817a67f commit 74e8d69

File tree

1 file changed

+34
-31
lines changed

1 file changed

+34
-31
lines changed

spider.py

Lines changed: 34 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
1-
import os
21
import requests
32
from urllib.parse import urlencode
3+
from requests import codes
4+
import os
45
from hashlib import md5
56
from multiprocessing.pool import Pool
67

7-
GROUP_START = 1
8-
GROUP_END = 5
9-
108

119
def get_page(offset):
1210
params = {
@@ -15,50 +13,52 @@ def get_page(offset):
1513
'keyword': '街拍',
1614
'autoload': 'true',
1715
'count': '20',
18-
'cur_tab': '3',
19-
'from': 'gallery',
16+
'cur_tab': '1',
17+
'from': 'search_tab'
2018
}
21-
url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
19+
base_url = 'https://www.toutiao.com/search_content/?'
20+
url = base_url + urlencode(params)
2221
try:
23-
response = requests.get(url)
24-
if response.status_code == 200:
25-
return response.json()
22+
resp = requests.get(url)
23+
if codes.ok == resp.status_code:
24+
return resp.json()
2625
except requests.ConnectionError:
2726
return None
2827

2928

3029
def get_images(json):
    """Yield one ``{'image': url, 'title': title}`` dict per image in a search result.

    Args:
        json: The decoded search response (a dict with a ``'data'`` list), or
            ``None`` when the page request failed.

    Yields:
        dict: ``'image'`` is the image URL and ``'title'`` is the title of the
        item the image belongs to (``None`` if the item has no title).

    Items that carry a ``cell_type`` field are skipped. Items without an
    ``image_list`` and images without a ``url`` yield nothing instead of
    raising.
    """
    # The page fetch may have failed and handed us None; treat it as "no results".
    if not json:
        return
    for item in json.get('data') or []:
        if item.get('cell_type') is not None:
            continue
        title = item.get('title')
        # 'image_list' may be absent or null; iterate an empty list in that case.
        for image in item.get('image_list') or []:
            url = image.get('url')
            if not url:
                continue
            # Only prefix the scheme when the URL does not already carry one,
            # otherwise we would produce 'https:https://...'.
            if not url.startswith(('http://', 'https://')):
                url = 'https:' + url
            yield {
                'image': url,
                'title': title
            }
4442

4543

4644
def save_image(item):
    """Download one image and store it under ``img/<title>/<md5-of-content>.jpg``.

    Args:
        item: A dict with ``'image'`` (the URL to download) and ``'title'``
            (used as the name of the sub-directory below ``img``).

    Returns:
        None. Progress and failures are reported with ``print``.

    The file name is the MD5 hex digest of the downloaded bytes, so an image
    that has already been saved in the same directory is not written again.
    """
    title = item.get('title')
    image_url = item.get('image')
    # The title comes from a remote response: keep it a single path component
    # so it cannot create nested directories or climb out of 'img'.
    folder = (title or '').replace('/', '_').replace('\\', '_').strip()
    if not image_url or folder in ('', '.', '..'):
        print('Failed to Save Image,item %s' % item)
        return
    img_path = os.path.join('img', folder)
    # exist_ok avoids the check-then-create race when several pool workers
    # handle images that share the same title.
    os.makedirs(img_path, exist_ok=True)
    try:
        # Without a timeout a stalled server would block this worker forever.
        resp = requests.get(image_url, timeout=10)
    except requests.RequestException:
        print('Failed to Save Image,item %s' % item)
        return
    if codes.ok != resp.status_code:
        print('Failed to Save Image,item %s' % item)
        return
    file_path = os.path.join(img_path, '{file_name}.{file_suffix}'.format(
        file_name=md5(resp.content).hexdigest(),
        file_suffix='jpg'))
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(resp.content)
        print('Downloaded image path is %s' % file_path)
    else:
        print('Already Downloaded', file_path)
6262

6363

6464
def main(offset):
@@ -68,6 +68,9 @@ def main(offset):
6868
save_image(item)
6969

7070

71+
GROUP_START = 0
72+
GROUP_END = 7
73+
7174
if __name__ == '__main__':
7275
pool = Pool()
7376
groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])

0 commit comments

Comments
 (0)