Skip to content

Commit 07c1113

Browse files
committed
Merge branch 'master' of https://github.com/injetlee/Python
2 parents 6ae2297 + 9176b0a commit 07c1113

File tree

3 files changed

+278
-18
lines changed

3 files changed

+278
-18
lines changed

biyingSpider.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,11 @@
55
url = 'http://cn.bing.com/'
66
con = requests.get(url)
77
content = con.text
8-
reg = r"(http://s.cn.bing.net/az/hprichbg/rb/.*?.jpg)"
8+
reg = r"(az/hprichbg/rb/.*?.jpg)"
99
a = re.findall(reg, content, re.S)[0]
1010
print(a)
11-
read = requests.get(a)
11+
picUrl = url + a
12+
read = requests.get(picUrl)
1213
f = open('%s.jpg' % local, 'wb')
1314
f.write(read.content)
1415
f.close()

image_recognition_zhihu.py

+202
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
# -*- coding:UTF-8 -*-
2+
3+
import requests , time ,random
4+
import hmac ,json ,base64
5+
from bs4 import BeautifulSoup
6+
from hashlib import sha1
7+
import TencentYoutuyun
8+
from PIL import Image
9+
import uuid
10+
11+
12+
13+
def recognition_captcha(data):
    """Run a base64-encoded captcha image through Tencent YouTu OCR.

    The decoded bytes are written to ``captcha_<uuid>.gif``, converted to
    ``captcha_<uuid>.png`` with Pillow, and the PNG path is handed to
    ``youtu.generalocr``.

    Args:
        data: Base64 text of the captcha image, or ``None`` when no captcha
            could be fetched.

    Returns:
        Whatever ``youtu.generalocr`` returns, or ``None`` when ``data`` is
        ``None``.
    """
    if data is None:
        return None

    file_id = str(uuid.uuid1())
    filename = 'captcha_' + file_id + '.gif'
    filename_png = 'captcha_' + file_id + '.png'

    with open(filename, 'wb') as fb:
        fb.write(base64.b64decode(data.encode('utf-8')))

    # Placeholder credentials: register for the YouTu service to obtain real ones.
    appid = 'appid'
    secret_id = 'secret_id'
    secret_key = 'secret_key'
    userid = 'userid'
    end_point = TencentYoutuyun.conf.API_YOUTU_END_POINT

    youtu = TencentYoutuyun.YouTu(appid, secret_id, secret_key, userid, end_point)

    # The captcha is saved as GIF, but (per the original author's note) YouTu
    # only accepts JPG/PNG/BMP, so convert it with Pillow first.  The context
    # manager closes the image even if the conversion fails.
    with Image.open(filename) as im:
        im.save(filename_png, "png")

    # NOTE(review): the .gif/.png files are never deleted, so repeated calls
    # accumulate files in the working directory - confirm whether that is wanted.

    # data_type: 0 means a local file path, 1 means a URL.
    return youtu.generalocr(filename_png, data_type=0, seq='')
42+
43+
44+
def get_captcha(sessiona, headers, poll_delay=None):
    """Poll Zhihu until a captcha is required, then fetch its image data.

    Args:
        sessiona: Session-like object; only its ``get`` and ``put`` methods
            are called.
        headers: Headers sent with every request.
        poll_delay: Seconds to wait after each poll.  ``None`` (the default)
            keeps the original randomised pause of 0.6-1.4 seconds.

    Returns:
        The ``img_base64`` value of the captcha response, or ``None`` when
        the PUT request or the decoding of its reply fails.
    """
    captcha_url = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=cn'

    need_cap = False
    while need_cap is not True:
        try:
            # Per the original author's notes, the first request obtains the
            # _xsrf cookie and the second the capsion_ticket cookie.
            sessiona.get('https://www.zhihu.com/signin', headers=headers)
            resp2 = sessiona.get(captcha_url, headers=headers)
            # {"show_captcha": false} means no captcha is needed this time.
            need_cap = json.loads(resp2.text)["show_captcha"]
        except Exception:
            # Best-effort polling: treat any failure as "no captcha yet" and
            # retry after the pause below instead of spinning in a tight loop.
            need_cap = False

        if poll_delay is None:
            time.sleep(0.5 + random.randint(1, 9) / 10)
        else:
            time.sleep(poll_delay)

    try:
        # The captcha image itself is requested with PUT, not GET.
        resp3 = sessiona.put(captcha_url, headers=headers)
        return json.loads(resp3.text)["img_base64"]
    except Exception:
        return None
66+
67+
def create_point(point_data, confidence):
    """Build the JSON answer holding the click points of low-confidence words.

    Each entry of ``point_data['items'][0]['words']`` is matched by position
    (1-based) to a fixed click coordinate.  Words whose ``confidence`` is
    below the threshold are taken to be the upside-down characters (per the
    original author's note, the OCR cannot read those and scores them low).

    Args:
        point_data: OCR result; ``point_data['items'][0]['words']`` must be a
            sequence of dicts carrying a ``'confidence'`` value.
        confidence: Threshold below which a word is selected.

    Returns:
        A JSON string ``{"img_size": [200, 44], "input_points": [...]}`` when
        one or two words were selected; otherwise an empty list.  An empty
        list is also returned when the OCR payload has no usable word list.
    """
    # Empirically the characters sit 25 px apart on x with a constant y,
    # seven positions in total; this is only a rough approximation.
    points = {
        1: [20.5, 25.1875],
        2: [45.5, 25.1875],
        3: [70.5, 25.1875],
        4: [95.5, 25.1875],
        5: [120.5, 25.1875],
        6: [145.5, 25.1875],
        7: [170.5, 25.1875],
    }

    try:
        words = point_data['items'][0]['words']
    except (KeyError, IndexError, TypeError):
        return []

    # Positions beyond the seven known slots have no coordinate and are skipped.
    input_points = [
        points[position]
        for position, word in enumerate(words, start=1)
        if word['confidence'] < confidence and position in points
    ]

    # Only captchas with one or two selected characters are worth submitting.
    if not 1 <= len(input_points) <= 2:
        return []

    result = json.dumps({'img_size': [200, 44], 'input_points': input_points})
    print(result)
    return result
92+
93+
def bolting(k_low, k_hi, k3_confidence):
    """Keep solving captchas until one is accepted; return the seconds taken.

    Only promising captchas are submitted: the OCR result must contain a
    21-character ``itemstring`` with an ``angle`` inside ``[k_low, k_hi]``,
    and ``create_point`` must produce an answer rather than an empty list.

    Relies on the module-level names ``sessiona`` and ``headers`` (assigned
    in the ``__main__`` section) as well as the sibling functions
    ``get_captcha``, ``recognition_captcha`` and ``create_point``.

    Args:
        k_low: Smallest OCR ``angle`` accepted.
        k_hi: Largest OCR ``angle`` accepted.
        k3_confidence: Confidence threshold forwarded to ``create_point``.

    Returns:
        Elapsed wall-clock time in seconds, as a float.
    """
    captcha_url = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=cn'
    start = time.time()

    is_success = False
    while is_success is not True:
        # Sentinel values that force at least one pass through the inner loop.
        points_len = 1
        angle = -20
        img_ko = []

        while points_len != 21 or angle < k_low or angle > k_hi:
            img_data = get_captcha(sessiona, headers)
            img_ko = recognition_captcha(img_data)
            try:
                points_len = len(img_ko['items'][0]['itemstring'])
                angle = img_ko['angle']
            except (KeyError, IndexError, TypeError):
                # Missing or malformed OCR result (including None): reset the
                # sentinels so the loop fetches another captcha.
                points_len = 1
                angle = -20

        input_text = create_point(img_ko, k3_confidence)
        if isinstance(input_text, list):
            # create_point returns a list when the captcha is not worth submitting.
            continue

        data = {"input_text": input_text}

        # Submitting too quickly is rejected with
        # {"code":120005,"name":"ERR_VERIFY_CAPTCHA_TOO_QUICK"}, so pause ~4-5 s.
        time.sleep(4 + random.randint(1, 9) / 10)
        try:
            resp5 = sessiona.post(captcha_url, data, headers=headers)
        except Exception:
            # Best-effort: any request failure just triggers another attempt.
            continue

        print("angle: " + str(angle))
        # A successful verification is answered with {"success":true}.
        print(BeautifulSoup(resp5.content, 'html.parser'))
        print('-' * 50)
        try:
            is_success = json.loads(resp5.text)["success"]
        except (KeyError, ValueError, TypeError):
            # No "success" key, a non-JSON body, or a non-object reply: retry.
            continue

    return time.time() - start
153+
154+
155+
if __name__ == "__main__":

    sessiona = requests.Session()
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0','authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'}

    k3_confidence = 0.71

    # The visualised data is stored in the cloud for browsing:
    # https://plot.ly/~weldon2010/4
    # Purely a learning exercise: widening the "angle" range showed no obvious
    # effect on recognition; most runs finish within 60 s.

    runtime_list_y = []
    nn = range(1, 11)  # go multi-threaded for far more runs if you like

    # 100 successful attempts form a 2-D grid that is rendered as a heat map.
    # NOTE(review): indentation was lost in this view; the two progress prints
    # are assumed to sit inside the inner loop - confirm against the repository.
    for y in nn:
        runtime_list_x = []
        for x in nn:
            runtime_list_x.append(bolting(-3, 3, k3_confidence))
            print("y: " + str(runtime_list_y))
            print("x: " + str(runtime_list_x))
        runtime_list_y.append(runtime_list_x)

    print("-" * 30)
    print(runtime_list_y)
    print("-" * 30)

    # Data visualisation: pip install plotly
    import plotly
    import plotly.graph_objs as go

    # Set your own account details (register on the plotly website).
    plotly.tools.set_credentials_file(username='username', api_key='username')
    axis_values = list(nn)
    trace = go.Heatmap(z=runtime_list_y, x=axis_values, y=list(axis_values))
    data = [trace]
    plotly.plotly.plot(data, filename='weldon-time2-heatmap')
191+
192+
# 尝试后发现一个特点,基本都是1~2个倒置中文,这样我们可以借此提速
193+
# 角度范围放大,仅当识别出倒置中文为1~2个时才提交验证否则放弃继续寻找
194+
195+
### chcp 65001 (win下改变cmd字符集)
196+
### python c:\python34\image_recognition_zhihu.py
197+
198+
199+
200+
201+
202+

login_zhihu.py

+73-16
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,84 @@
1-
import requests,time
1+
# -*- coding:UTF-8 -*-
2+
3+
import requests , time
4+
import hmac ,json
25
from bs4 import BeautifulSoup
3-
url = 'https://www.zhihu.com/login/email'
4-
def get_captcha(data):
6+
from hashlib import sha1
7+
8+
9+
def get_captcha(data, need_cap):
    """Save the captcha image and ask the user to type what it shows.

    Args:
        data: Raw image bytes; written to ``captcha.gif`` in the working
            directory.
        need_cap: Truthy when a captcha is required.

    Returns:
        The text entered at the ``captcha:`` prompt, or ``None`` when no
        captcha is required.
    """
    # Any falsy flag (False, None, 0) means there is nothing to solve.
    if not need_cap:
        return None

    with open('captcha.gif', 'wb') as fb:
        fb.write(data)
    return input('captcha:')
16+
17+
def get_signature(grantType, clientId, source, timestamp):
    """Compute the HMAC-SHA1 signature sent with the sign-in request.

    The signed message is the concatenation, in this order, of
    ``grantType``, ``clientId``, ``source`` and ``timestamp``; the key is a
    fixed byte string.

    Args:
        grantType: Grant type string.
        clientId: Client id string.
        source: Source string.
        timestamp: Timestamp as a string.

    Returns:
        The signature as a 40-character lowercase hex string.
    """
    message = ''.join((grantType, clientId, source, timestamp)).encode()
    return hmac.new(b'd1b964811afb40118a12068ff74a12f4', message, sha1).hexdigest()
27+
28+
29+
30+
def login(username, password, oncaptcha, sessiona, headers):
    """Sign in through the Zhihu OAuth endpoint and return the raw reply.

    Args:
        username: Account name sent as ``username``.
        password: Account password sent as ``password``.
        oncaptcha: Callable invoked as ``oncaptcha(image_bytes, need_cap)``;
            its return value is sent as ``captcha``.
        sessiona: Session object used for every request.
        headers: Headers sent with every request.

    Returns:
        The body (``.content``) of the sign-in response.
    """
    # Per the original author's notes, the first request obtains the _xsrf
    # cookie and the second the capsion_ticket cookie.
    sessiona.get('https://www.zhihu.com/signin', headers=headers)
    resp2 = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn', headers=headers)
    # {"show_captcha": false} means no captcha is needed.
    need_cap = json.loads(resp2.text)["show_captcha"]

    grantType = 'password'
    clientId = 'c3cef7c66a1843f8b3a9e6a1e3160e20'
    source = 'com.zhihu.web'
    # Millisecond timestamp; the only input to the signature that varies.
    timestamp = str(int(time.time() * 1000))

    captcha_content = sessiona.get(
        'https://www.zhihu.com/captcha.gif?r=%d&type=login' % (time.time() * 1000),
        headers=headers,
    ).content

    data = {
        "client_id": clientId,
        "grant_type": grantType,
        "timestamp": timestamp,
        "source": source,
        "signature": get_signature(grantType, clientId, source, timestamp),
        "username": username,
        "password": password,
        "lang": "cn",
        "captcha": oncaptcha(captcha_content, need_cap),
        "ref_source": "other_",
        "utm_source": ""
    }

    # Debug output; the password is masked so it never reaches the console.
    print("**2**: " + str(dict(data, password='***')))
    print("-" * 50)
    resp = sessiona.post('https://www.zhihu.com/api/v3/oauth/sign_in', data, headers=headers).content
    print(BeautifulSoup(resp, 'html.parser'))

    print("-" * 50)
    return resp
2565

2666
if __name__ == "__main__":
    sessiona = requests.Session()
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0','authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'}

    # Replace the user name and password with your own.
    login('12345678@qq.com', '12345678', get_captcha, sessiona, headers)

    # Once signed in, the private-message inbox can be fetched.
    inbox = sessiona.get('https://www.zhihu.com/inbox', headers=headers)
    resp = inbox
    print(BeautifulSoup(resp.content, 'html.parser'))
73+
74+
75+
76+
77+
### chcp 65001 (win下改变cmd字符集)
78+
### python c:\python34\login_zhihu.py
79+
### 有非常无语的事情发生,还以为代码没生效
80+
81+
82+
83+
84+

0 commit comments

Comments
 (0)