|
| 1 | +# -*- coding:UTF-8 -*- |
| 2 | + |
| 3 | +import requests , time ,random |
| 4 | +import hmac ,json ,base64 |
| 5 | +from bs4 import BeautifulSoup |
| 6 | +from hashlib import sha1 |
| 7 | +import TencentYoutuyun |
| 8 | +from PIL import Image |
| 9 | +import uuid |
| 10 | + |
| 11 | + |
| 12 | + |
def recognition_captcha(data):
    """Run the captcha image through the Tencent YouTu general OCR service.

    The captcha arrives as a base64-encoded GIF. YouTu only accepts
    JPG/PNG/BMP, so the image is written to disk, converted to PNG with
    Pillow, and the PNG path is handed to ``generalocr``.

    Args:
        data: Base64 text of the captcha image, or ``None``.

    Returns:
        Whatever ``youtu.generalocr`` returns, or ``None`` when ``data`` is
        ``None``.

    Raises:
        Any exception raised while decoding, converting or calling the OCR
        service is propagated to the caller; the temporary files are removed
        in every case.
    """
    import os  # local import: only needed for temporary-file cleanup

    if data is None:
        return None

    # Decode before touching the disk so bad input leaves no files behind.
    raw = base64.b64decode(data.encode('utf-8'))

    file_id = str(uuid.uuid1())
    filename = 'captcha_' + file_id + '.gif'
    filename_png = 'captcha_' + file_id + '.png'

    try:
        with open(filename, 'wb') as fb:
            fb.write(raw)

        # Placeholder credentials: register with the YouTu service to get real ones.
        appid = 'appid'
        secret_id = 'secret_id'
        secret_key = 'secret_key'
        userid = 'userid'
        end_point = TencentYoutuyun.conf.API_YOUTU_END_POINT

        youtu = TencentYoutuyun.YouTu(appid, secret_id, secret_key, userid, end_point)

        # GIF -> PNG conversion (requires `pip install Pillow`).
        with Image.open(filename) as im:
            im.save(filename_png, "png")

        # data_type=0 means a local path, 1 means a URL.
        # NOTE(review): assumes generalocr has finished reading the file by the
        # time it returns, so deleting it afterwards is safe - confirm.
        return youtu.generalocr(filename_png, data_type=0, seq='')
    finally:
        # The original never deleted these files, leaking two per attempt.
        for path in (filename, filename_png):
            try:
                os.remove(path)
            except OSError:
                pass  # file was never created or is already gone
| 42 | + |
| 43 | + |
def get_captcha(sessiona, headers):
    """Poll zhihu until a captcha is required, then fetch its image data.

    Args:
        sessiona: A session object providing ``get`` and ``put`` (the script
            passes a ``requests.Session``); cookies set by the GET requests
            are kept on it.
        headers: HTTP headers sent with every request.

    Returns:
        The ``img_base64`` string from the captcha response, or ``None`` if
        the final PUT request or its parsing fails.
    """
    signin_url = 'https://www.zhihu.com/signin'
    captcha_url = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=cn'

    need_cap = False
    while need_cap is not True:
        try:
            sessiona.get(signin_url, headers=headers)  # obtains cookie: _xsrf
            resp2 = sessiona.get(captcha_url, headers=headers)  # obtains cookie: capsion_ticket
            # {"show_captcha": false} means no captcha is needed this time.
            need_cap = json.loads(resp2.text)["show_captcha"]
        except Exception:
            # Best-effort retry; fall through to the pause below instead of
            # retrying immediately (the original spun without any delay here).
            pass
        time.sleep(0.5 + random.randint(1, 9) / 10)

    try:
        # The captcha payload is fetched with PUT, not GET.
        resp3 = sessiona.put(captcha_url, headers=headers)
        img_data = json.loads(resp3.text)["img_base64"]
    except Exception:
        return None

    return img_data
| 66 | + |
def create_point(point_data, confidence):
    """Build the click-point payload for the characters judged upside-down.

    Every word whose OCR confidence is below ``confidence`` is treated as an
    inverted character (YouTu cannot recognise those, so they score low) and
    mapped to a fixed click position.

    Args:
        point_data: OCR result; ``point_data['items'][0]['words']`` must be a
            sequence of dicts that each carry a ``'confidence'`` value.
        confidence: Threshold below which a word counts as inverted.

    Returns:
        A JSON string with ``img_size`` and ``input_points`` when one or two
        words are flagged; otherwise an empty list.
    """
    # Rough model of the captcha layout: 7 slots, x spaced 25 apart starting
    # at 20.5, all on the same y. Words beyond the 7th slot are ignored.
    slot_count = 7
    y_coord = 25.1875

    input_points = [
        [20.5 + 25 * (position - 1), y_coord]
        for position, entry in enumerate(point_data['items'][0]['words'], start=1)
        if entry['confidence'] < confidence and position <= slot_count
    ]

    # Only submissions with one or two inverted characters have a good
    # success rate; anything else is rejected.
    if not 1 <= len(input_points) <= 2:
        return []

    payload = json.dumps({'img_size': [200, 44], 'input_points': input_points})
    print(payload)
    return payload
| 92 | + |
def bolting(k_low, k_hi, k3_confidence):
    """Keep fetching captchas, submitting only promising ones, until one verifies.

    Relies on the module-level globals ``sessiona`` and ``headers`` created in
    the ``__main__`` section.

    Args:
        k_low: Lowest accepted value of the OCR result's ``'angle'``.
        k_hi: Highest accepted value of the OCR result's ``'angle'``.
        k3_confidence: Confidence threshold passed on to ``create_point``.

    Returns:
        Elapsed wall-clock seconds until the server answered with
        ``"success": true``.
    """
    start = time.time()
    captcha_url = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=cn'

    is_success = False
    while is_success is not True:

        points_len = 1
        angle = -20
        img_ko = []

        # Fetch captchas until the OCR text has the expected length and the
        # reported angle lies within [k_low, k_hi].
        # NOTE(review): 21 is presumably 7 characters x 3 as YouTu reports
        # them - confirm against a real response.
        while points_len != 21 or angle < k_low or angle > k_hi:
            img_data = get_captcha(sessiona, headers)
            img_ko = recognition_captcha(img_data)

            # Debugging tip: json.dumps(img_ko, indent=2, ensure_ascii=False)
            # shows the recognised Chinese text instead of ASCII escapes.
            try:
                points_len = len(img_ko['items'][0]['itemstring'])
                angle = img_ko['angle']
            except Exception:
                # Missing/odd OCR result (including None): reset and retry.
                points_len = 1
                angle = -20
                continue

        input_text = create_point(img_ko, k3_confidence)
        if isinstance(input_text, list):
            # create_point returns [] when the captcha is not worth submitting.
            continue

        data = {
            "input_text": input_text
        }

        # Submitting too quickly is rejected with
        # {"code":120005,"name":"ERR_VERIFY_CAPTCHA_TOO_QUICK"}, so pretend
        # to think for about five seconds.
        time.sleep(4 + random.randint(1, 9) / 10)
        try:
            resp5 = sessiona.post(captcha_url, data, headers=headers)
        except Exception:
            continue

        print("angle: " + str(angle))
        # On success the server answers {"success":true}.
        print(BeautifulSoup(resp5.content, 'html.parser'))
        print('-' * 50)
        try:
            is_success = json.loads(resp5.text)["success"]
        except (KeyError, TypeError, ValueError):
            # KeyError: error payload without "success"; ValueError: body is
            # not JSON; TypeError: JSON that is not an object. The original
            # caught only KeyError and crashed on the other two.
            continue

    return time.time() - start
| 153 | + |
| 154 | + |
if __name__ == "__main__":

    # bolting() reads `sessiona` and `headers` as module-level globals, so
    # they must be defined here under exactly these names.
    sessiona = requests.Session()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0',
        'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
    }

    k3_confidence = 0.71

    # The visualised data is stored in the cloud for browsing:
    # https://plot.ly/~weldon2010/4
    # Purely a learning exercise: widening the "angle" range showed no visible
    # effect on recognition. Most runs finish within 60s, which says YouTu is
    # both capable and fast.
    nn = range(1, 11)  # with multithreading, a million runs would be more fun

    # 100 successful attempts, collected as a 10x10 grid for a heatmap.
    runtime_list_y = []
    for _row in nn:
        runtime_list_x = []
        for _col in nn:
            runtime_list_x.append(bolting(-3, 3, k3_confidence))
            print("y: " + str(runtime_list_y))
            print("x: " + str(runtime_list_x))
        runtime_list_y.append(runtime_list_x)

    separator = "-" * 30
    print(separator)
    print(runtime_list_y)
    print(separator)

    # Data visualisation: pip install plotly
    import plotly
    import plotly.graph_objs as go

    # Placeholder credentials: register on the plotly site to get real ones.
    plotly.tools.set_credentials_file(username='username', api_key='username')
    heatmap = go.Heatmap(z=runtime_list_y, x=list(nn), y=list(nn))
    plotly.plotly.plot([heatmap], filename='weldon-time2-heatmap')

    # Observation after experimenting: there are almost always 1-2 inverted
    # characters, which can be used to speed things up. Widen the angle range
    # and submit only when 1-2 inverted characters are detected; otherwise
    # drop the captcha and keep looking.
| 194 | + |
| 195 | +### chcp 65001 (win下改变cmd字符集) |
| 196 | +### python c:\python34\image_recognition_zhihu.py |
| 197 | + |
| 198 | + |
| 199 | + |
| 200 | + |
| 201 | + |
| 202 | + |
0 commit comments