Skip to content

Commit 226fc99

Browse files
authored
花瓣网
1 parent 7eed6cf commit 226fc99

File tree

1 file changed

+108
-0
lines changed

1 file changed

+108
-0
lines changed

huaban.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
# -*- coding: utf-8 -*-
2+
3+
'''
4+
python 2.7.12
5+
'''
6+
7+
import requests
8+
from parsel import Selector
9+
import time
10+
import re, random, os
11+
12+
13+
def scraw_pin_ids(seed_pin_id='1068018182', max_retries=10):
    """Collect pin ids by paging through huaban.com's "beauty" listing.

    Repeatedly requests the JSON listing endpoint, passing the last pin id
    seen as the ``max`` paging cursor, until the server returns an empty
    ``pins`` list.

    Args:
        seed_pin_id: pin id used as the initial paging cursor
            (original hard-coded value kept as the default).
        max_retries: consecutive request failures tolerated before giving
            up.  The original bare ``except: continue`` retried forever,
            so a dead endpoint made this function spin indefinitely.

    Returns:
        set of pin ids as returned by the API.
    """
    url = "http://huaban.com/favorite/beauty/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Accept': 'application/json',
        'X-Request': 'JSON',
        'X-Requested-With': 'XMLHttpRequest',
    }

    pin_ids = []
    pin_id = seed_pin_id
    failures = 0
    while True:
        params = {
            'j0l4lymf': '',
            'max': pin_id,
            'limit': '20',
            'wfl': '1',
        }
        try:
            resp = requests.get(url, params=params, headers=headers)
            pins = resp.json()['pins']
        except (requests.RequestException, ValueError, KeyError):
            # Narrowed from a bare ``except``; ValueError also covers JSON
            # decode errors.  Cap retries so a dead endpoint cannot loop
            # forever.
            failures += 1
            if failures >= max_retries:
                break
            continue
        failures = 0
        if not pins:
            break
        for pin in pins:
            pin_ids.append(pin['pin_id'])
            print(pin['pin_id'])
        # Last id of the page becomes the cursor for the next request.
        pin_id = pin_ids[-1]
        time.sleep(0.001)  # be gentle with the server between pages
    return set(pin_ids)
52+
53+
def scraw_urls(pin_ids):
    """Fetch each pin's detail page and extract the image ``"key"`` tokens.

    Args:
        pin_ids: iterable of pin ids (anything ``str()`` accepts).

    Returns:
        set of image key strings scraped from the pin pages; keys are later
        appended to the CDN host to form image URLs.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    # Hoisted out of the loop: the original re.compile()d the same pattern
    # once per page.
    key_pattern = re.compile('"key":"(.*?)"', re.S)

    urls = []
    for pin_id in pin_ids:
        page_url = 'http://huaban.com/pins/' + str(pin_id) + '/'
        try:
            response = requests.get(page_url, headers=headers)
        except requests.RequestException:
            # Narrowed from a bare ``except``: skip only unreachable pages
            # instead of silently swallowing every error.
            continue
        items = key_pattern.findall(response.text)
        urls.extend(items)
        print(items)
        print('============================================================================================================')
    return set(urls)
77+
78+
def download(urls, dest_dir=None, min_bytes=40000):
    """Download images by key into ``<dest_dir>/huaban`` as 1.jpg, 2.jpg, ...

    Args:
        urls: iterable of image key strings; deduplicated here via set().
        dest_dir: base directory for the ``huaban`` folder.  Defaults to
            the module-level ``file_path`` so the original call site keeps
            working unchanged.
        min_bytes: responses smaller than this are skipped (the original's
            hard-coded 40000 threshold, presumably filtering placeholder
            or thumbnail responses).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    if dest_dir is None:
        dest_dir = file_path  # module-level setting, kept for compatibility
    target = os.path.join(dest_dir, "huaban")
    # Create the target directory once, outside the loop (the original
    # re-checked existence on every URL).
    if not os.path.exists(target):
        os.makedirs(target)

    n = 1
    for key in set(urls):
        image_url = 'http://img.hb.aicdn.com/' + key
        try:
            r = requests.get(image_url, headers=headers)
        except requests.RequestException:
            # Narrowed from two nested bare ``except`` blocks: only skip
            # failed downloads instead of masking every error.
            continue
        if len(r.content) > min_bytes:
            # Write with an absolute path instead of os.chdir(), which
            # mutated process-wide state; ``with`` already closes the file,
            # so the original's explicit f.close() was redundant.
            with open(os.path.join(target, str(n) + ".jpg"), 'wb') as f:
                f.write(r.content)
            print(u"第" + str(n) + u"张图片下载成功")
            n += 1
103+
104+
# Image storage base path.  Raw string so the backslashes are literal and
# not (invalid) escape sequences; the value is byte-identical to the
# original non-raw literal.
file_path = r'E:\selfprogress\programming\project\pa1024\huabannnnnnn'

if __name__ == '__main__':
    # Pipeline: collect pin ids -> resolve them to image keys -> download.
    # The guard keeps the scrape from running if this module is imported.
    pin_ids = scraw_pin_ids()
    urls = scraw_urls(pin_ids)
    download(urls)

0 commit comments

Comments
 (0)