@@ -1,12 +1,10 @@
-import os
 import requests
 from urllib.parse import urlencode
+from requests import codes
+import os
 from hashlib import md5
 from multiprocessing.pool import Pool

-GROUP_START = 1
-GROUP_END = 5
-

 def get_page(offset):
     params = {
@@ -15,50 +13,52 @@ def get_page(offset):
         'keyword': '街拍',
         'autoload': 'true',
         'count': '20',
-        'cur_tab': '3',
-        'from': 'gallery',
+        'cur_tab': '1',
+        'from': 'search_tab'
     }
-    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
+    base_url = 'https://www.toutiao.com/search_content/?'
+    url = base_url + urlencode(params)
     try:
-        response = requests.get(url)
-        if response.status_code == 200:
-            return response.json()
+        resp = requests.get(url)
+        if codes.ok == resp.status_code:
+            return resp.json()
     except requests.ConnectionError:
         return None


 def get_images(json):
-    data = json.get('data')
-    if data:
+    if json.get('data'):
+        data = json.get('data')
         for item in data:
-            # print(item)
-            image_list = item.get('image_list')
+            if item.get('cell_type') is not None:
+                continue
             title = item.get('title')
-            # print(image_list)
-            if image_list:
-                for image in image_list:
-                    yield {
-                        'image': image.get('url'),
-                        'title': title
-                    }
+            images = item.get('image_list')
+            for image in images:
+                yield {
+                    'image': 'https:' + image.get('url'),
+                    'title': title
+                }


 def save_image(item):
-    if not os.path.exists(item.get('title')):
-        os.mkdir(item.get('title'))
+    img_path = 'img' + os.path.sep + item.get('title')
+    if not os.path.exists(img_path):
+        os.makedirs(img_path)
     try:
-        local_image_url = item.get('image')
-        new_image_url = local_image_url.replace('list', 'large')
-        response = requests.get('http:' + new_image_url)
-        if response.status_code == 200:
-            file_path = '{0}/{1}.{2}'.format(item.get('title'), md5(response.content).hexdigest(), 'jpg')
+        resp = requests.get(item.get('image'))
+        if codes.ok == resp.status_code:
+            file_path = img_path + os.path.sep + '{file_name}.{file_suffix}'.format(
+                file_name=md5(resp.content).hexdigest(),
+                file_suffix='jpg')
             if not os.path.exists(file_path):
-                with open(file_path, 'wb')as f:
-                    f.write(response.content)
+                with open(file_path, 'wb') as f:
+                    f.write(resp.content)
+                print('Downloaded image path is %s' % file_path)
             else:
                 print('Already Downloaded', file_path)
     except requests.ConnectionError:
-        print('Failed to save image')
+        print('Failed to Save Image,item %s' % item)


 def main(offset):
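A quick aside, not part of the commit: the rewritten save_image stores files under img/<title> and names each one after the MD5 of its bytes, so the same image content is never written twice. A minimal sketch of that path logic, using made-up sample bytes and a made-up title:

import os
from hashlib import md5

content = b'sample image bytes'  # stand-in for resp.content
title = 'street-snap'            # stand-in for item.get('title')

img_path = 'img' + os.path.sep + title
file_path = img_path + os.path.sep + '{file_name}.{file_suffix}'.format(
    file_name=md5(content).hexdigest(),  # 32-char hex digest, stable per content
    file_suffix='jpg')
print(file_path)  # img/street-snap/<32-char md5 hex>.jpg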
@@ -68,6 +68,9 @@ def main(offset):
         save_image(item)


+GROUP_START = 0
+GROUP_END = 7
+
 if __name__ == '__main__':
     pool = Pool()
     groups = ([x * 20 for x in range(GROUP_START, GROUP_END + 1)])
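For reference, not part of the commit: with the new GROUP_START = 0 and GROUP_END = 7, the groups comprehension above yields eight offsets, one page of 20 results each:

GROUP_START = 0
GROUP_END = 7
groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
print(groups)  # [0, 20, 40, 60, 80, 100, 120, 140]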
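One more aside, not from the diff itself: requests.codes.ok is just the integer 200, so the new codes.ok == resp.status_code checks behave exactly like the old response.status_code == 200 comparisons, minus the magic number:

import requests

# requests exposes named HTTP status codes; 'ok' maps to 200.
assert requests.codes.ok == 200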