forked from mvdctop/Movie_Data_Capture
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathairav.py
229 lines (214 loc) · 9.1 KB
/
airav.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
import sys
sys.path.append('../')
import re
from pyquery import PyQuery as pq#need install
from lxml import etree#need install
from bs4 import BeautifulSoup#need install
import json
from ADC_function import *
from WebCrawler import javbus
'''
API
注册:https://www.airav.wiki/api/auth/signup
设置:https://www.airav.wiki/api/get_web_settings
搜索:https://www.airav.wiki/api/video/list?lng=zh-CN&search=
搜索:https://www.airav.wiki/api/video/list?lang=zh-TW&lng=zh-TW&search=
'''
# Base URL of the airav site; search() prepends it to the video-list API path.
host = 'https://www.airav.wiki'
# airav has no actor photos, so the javbus ones are used directly.
def getActorPhoto(javbus_json):
    """Return the javbus ``actor_photo`` mapping, or ``''`` when unavailable.

    Args:
        javbus_json: Parsed javbus metadata (a dict-like object with ``get``).

    Returns:
        The ``actor_photo`` value when it is a non-empty dict, otherwise ``''``.
    """
    photos = javbus_json.get('actor_photo')
    return photos if isinstance(photos, dict) and photos else ''
def getTitle(htmlcode):
    """Extract the video title from the page's ``<title>`` element.

    The title is the text captured between a ``]`` and the ``- AIRAV-WIKI``
    suffix of the ``<title>`` text, with surrounding whitespace removed.

    Args:
        htmlcode: HTML source of the airav video page.

    Returns:
        The stripped title string.

    Raises:
        IndexError: If the page has no ``<title>`` text or the text does not
            match the expected pattern.
    """
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    page_title = str(tree.xpath('/html/head/title/text()')[0])
    captured = re.findall('](.*?)- AIRAV-WIKI', page_title)
    return str(captured[0]).strip()
def getStudio(htmlcode, javbus_json):
    """Return the studio (maker) name, preferring javbus data.

    Args:
        htmlcode: HTML source of the airav video page.
        javbus_json: Parsed javbus metadata (a dict-like object with ``get``).

    Returns:
        The javbus ``studio`` value when it is a non-empty string; otherwise
        the first non-blank text of an airav link whose ``href`` contains
        ``?video_factory=``; ``''`` when neither is available.
    """
    # javbus data wins whenever it is present.
    studio = javbus_json.get('studio')
    if isinstance(studio, str) and studio:
        return studio
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    # Read the text nodes directly instead of stringifying the result list:
    # str(list) leaks repr quoting/escapes into the name and glues several
    # matches together as "A', 'B".
    for text in html.xpath('//a[contains(@href,"?video_factory=")]/text()'):
        name = str(text).strip()
        if name:
            return name
    return ''
def getYear(htmlcode, javbus_json):
    """Return the release year, preferring javbus data.

    Args:
        htmlcode: HTML source of the airav video page.
        javbus_json: Parsed javbus metadata (a dict-like object with ``get``).

    Returns:
        The javbus ``year`` value when it is a non-empty string. Otherwise the
        first four characters of the release date from ``getRelease`` when that
        date has the length of ``YYYY-MM-DD``; ``''`` in every other case.
    """
    year = javbus_json.get('year')
    if isinstance(year, str) and year:
        return year
    release_date = getRelease(htmlcode, javbus_json)
    # Only a full-length date is trusted to carry the year in its first 4 chars.
    if len(release_date) == len('2000-01-01'):
        return release_date[:4]
    return ''
def getCover(htmlcode, javbus_json):
    """Return the cover image URL, preferring javbus data.

    Args:
        htmlcode: HTML source of the airav video page.
        javbus_json: Parsed javbus metadata (a dict-like object with ``get``).

    Returns:
        The javbus ``cover`` value when it is a non-empty string; otherwise the
        ``src`` of the first ``<img>`` whose ``src`` contains
        ``/storage/big_pic/``.

    Raises:
        IndexError: If javbus has no cover and the page has no matching image.
    """
    cover = javbus_json.get('cover')
    if not (isinstance(cover, str) and cover):
        tree = etree.fromstring(htmlcode, etree.HTMLParser())
        cover = tree.xpath('//img[contains(@src,"/storage/big_pic/")]/@src')[0]
    return cover
def getRelease(htmlcode, javbus_json):
    """Return the release date, preferring javbus data.

    Args:
        htmlcode: HTML source of the airav video page.
        javbus_json: Parsed javbus metadata (a dict-like object with ``get``).

    Returns:
        The javbus ``release`` value when it is a non-empty string; otherwise
        the first ``YYYY-MM-DD`` date found in the text of ``<li>`` elements
        containing the release-date label; ``''`` when no date is found.
    """
    release = javbus_json.get('release')
    if isinstance(release, str) and release:
        return release
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        label_texts = html.xpath('//li[contains(text(),"發片日期")]/text()')
        return re.search(r'\d{4}-\d{2}-\d{2}', str(label_texts)).group()
    except AttributeError:
        # No date matched (re.search returned None) or the parser produced no
        # tree. Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
        return ''
def getRuntime(javbus_json):
    """Return the javbus ``runtime`` value, or ``''`` when it is unusable.

    Args:
        javbus_json: Parsed javbus metadata (a dict-like object with ``get``).

    Returns:
        The ``runtime`` value when it is a non-empty string, otherwise ``''``.
    """
    runtime = javbus_json.get('runtime')
    usable = isinstance(runtime, str) and len(runtime) > 0
    return runtime if usable else ''
# airav's actress database mostly uses kanji names while javbus mostly uses
# kana, so airav takes priority here.
def getActor(htmlcode, javbus_json):
    """Return the list of performer names, preferring the airav page.

    Args:
        htmlcode: HTML source of the airav video page.
        javbus_json: Parsed javbus metadata (a dict-like object with ``get``).

    Returns:
        The stripped, non-empty link texts of ``/idol/`` links inside the
        ``videoAvstarList`` list when there are any; otherwise the javbus
        ``actor`` value when it is a non-empty list; otherwise ``[]``.
    """
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    link_texts = tree.xpath('//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()')
    names = [text.strip() for text in link_texts if text.strip()]
    if names:
        return names
    fallback = javbus_json.get('actor')
    if isinstance(fallback, list) and fallback:
        return fallback
    return []
def getNum(htmlcode, javbus_json):
    """Return the video's catalogue number, preferring javbus data.

    Args:
        htmlcode: HTML source of the airav video page.
        javbus_json: Parsed javbus metadata (a dict-like object with ``get``).

    Returns:
        The javbus ``number`` value when it is a non-empty string; otherwise
        the text inside the leading ``[...]`` of the page's ``<title>``.

    Raises:
        IndexError: If javbus has no number and the page title is missing or
            does not start with a bracketed number.
    """
    number = javbus_json.get('number')
    if isinstance(number, str) and number:
        return number
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    title = str(html.xpath('/html/head/title/text()')[0])
    # Raw string: '\[' in a normal literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    return str(re.findall(r'^\[(.*?)]', title)[0])
def getDirector(javbus_json):
    """Return the javbus ``director`` value, or ``''`` when it is unusable.

    Args:
        javbus_json: Parsed javbus metadata (a dict-like object with ``get``).

    Returns:
        The ``director`` value when it is a non-empty string, otherwise ``''``.
    """
    director = javbus_json.get('director')
    if not isinstance(director, str):
        return ''
    return director if director else ''
def getOutline(htmlcode):
    """Return the synopsis text from the airav page.

    Args:
        htmlcode: HTML source of the airav video page.

    Returns:
        The string value of the synopsis paragraph with newlines removed and
        surrounding whitespace stripped; ``''`` when it cannot be read.
    """
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    try:
        synopsis = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)")
    except AttributeError:
        # The parser produced no tree. A bare ``except`` here used to swallow
        # KeyboardInterrupt/SystemExit as well.
        return ''
    return synopsis.replace('\n', '').strip()
def getSerise(javbus_json):
    """Return the javbus ``series`` value, or ``''`` when it is unusable.

    Args:
        javbus_json: Parsed javbus metadata (a dict-like object with ``get``).

    Returns:
        The ``series`` value when it is a non-empty string, otherwise ``''``.
    """
    series = javbus_json.get('series')
    if isinstance(series, str) and series != '':
        return series
    return ''
def getTag(htmlcode):
    """Return the tag names listed on the airav page.

    Args:
        htmlcode: HTML source of the airav video page.

    Returns:
        The text of every ``<a>`` inside the first element whose class is
        ``tagBtnMargin``.

    Raises:
        IndexError: If the page has no ``tagBtnMargin`` element.
    """
    soup = BeautifulSoup(htmlcode, 'lxml')
    tag_container = soup.find_all(attrs={'class': 'tagBtnMargin'})[0]
    return [anchor.get_text() for anchor in tag_container.find_all('a')]
def getExtrafanart(htmlcode):
    """Return the still-image URLs from the page's thumbnail block.

    Args:
        htmlcode: HTML source of the airav video page.

    Returns:
        A list with the ``src`` of every ``<img>`` inside the
        ``mobileImgThumbnail`` block, or ``''`` when the block is missing or
        holds no images.
    """
    thumb_block = re.search(
        r'<div class=\"mobileImgThumbnail\">[\s\S]*?</div></div></div></div>',
        htmlcode,
    )
    if thumb_block is None:
        return ''
    image_urls = re.findall(r'<img.*?src=\"(.*?)\"', thumb_block.group())
    return image_urls or ''
def search(keyword):
    """Query the airav video-list API and collect every result entry.

    Pages are requested one after another, starting at page 1, until the
    response's ``count`` is covered by ``offset`` plus the number of entries
    received so far on the current page.

    The response is a JSON object with ``offset``, ``count``, ``result`` and
    ``status`` keys; each ``result`` entry carries fields such as ``vid``,
    ``slug``, ``name``, ``url``, ``view``, ``img_url`` and ``barcode``.
    NOTE(review): ``count`` looks like the total number of matches across all
    pages -- confirm against the API.

    Args:
        keyword: Search text, appended to the query string as-is.

    Returns:
        The accumulated ``result`` entries. An empty list is returned when a
        response is not valid JSON, even if earlier pages were collected.
    """
    collected = []
    page_no = 1
    while True:
        response = get_html(host + '/api/video/list?lang=zh-TW&lng=jp&search=' + keyword + '&page=' + str(page_no))
        try:
            payload = json.loads(response)
        except json.decoder.JSONDecodeError:
            print("[-]Json decoder error!")
            return []
        offset = int(payload["offset"])
        total = int(payload["count"])
        page_size = len(payload["result"])
        if total <= 0 or page_size <= 0:
            break
        covered = offset + page_size
        # Keep this page unless it overshoots the reported total.
        if total >= covered:
            collected.extend(payload["result"])
        # Stop once the reported total is reached (or overshot).
        if total <= covered:
            break
        page_no += 1
    return collected
def main(number):
    """Scrape metadata for ``number`` from airav, supplemented by javbus.

    The airav page is fetched first; javbus metadata is then loaded and used
    for the fields it provides. A javbus failure no longer discards the airav
    data: the javbus metadata is treated as empty and every getter falls back
    to the airav page where it can.

    Args:
        number: Video catalogue number, e.g. ``'ADN-188'``.

    Returns:
        A JSON string with the collected metadata. When the airav page cannot
        be fetched or parsed, a JSON object containing only an empty
        ``title`` is returned instead.
    """
    try:
        try:
            htmlcode = get_html('https://cn.airav.wiki/video/' + number)
        except Exception:
            # Without the airav page nothing can be built; report and bail out
            # through the outer handler.
            print(number)
            raise
        try:
            javbus_json = json.loads(javbus.main(number))
        except Exception:
            # Previously this left javbus_json unbound, so the NameError that
            # followed threw away perfectly good airav data.
            print(number)
            javbus_json = {}
        if not isinstance(javbus_json, dict):
            javbus_json = {}
        # Used for both 'label' and 'series'.
        series = getSerise(javbus_json)
        dic = {
            # Title comes from airav.
            'title': getTitle(htmlcode),
            # Studio: javbus first, airav as fallback.
            'studio': getStudio(htmlcode, javbus_json),
            # Year: javbus first, airav as fallback.
            'year': getYear(htmlcode, javbus_json),
            # Outline comes from airav.
            'outline': getOutline(htmlcode),
            # Runtime comes from javbus.
            'runtime': getRuntime(javbus_json),
            # Director comes from javbus.
            'director': getDirector(javbus_json),
            # Actors: airav first, javbus as fallback.
            'actor': getActor(htmlcode, javbus_json),
            # Release date: javbus first, airav as fallback.
            'release': getRelease(htmlcode, javbus_json),
            # Number: javbus first, airav page title as fallback.
            'number': getNum(htmlcode, javbus_json),
            # Cover URL: javbus first, airav as fallback.
            'cover': getCover(htmlcode, javbus_json),
            # Still images from airav.
            'extrafanart': getExtrafanart(htmlcode),
            'imagecut': 1,
            # Tags come from airav.
            'tag': getTag(htmlcode),
            # Label reuses the javbus series.
            'label': series,
            # airav does not provide actor photos; left disabled.
            # 'actor_photo': getActorPhoto(javbus_json),
            'website': 'https://www.airav.wiki/video/' + number,
            'source': 'airav.py',
            # Series comes from javbus.
            'series': series
        }
        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
        return js
    except Exception as e:
        if config.getInstance().debug():
            print(e)
        data = {
            "title": "",
        }
        js = json.dumps(
            data, ensure_ascii=False, sort_keys=True, indent=4, separators=(",", ":")
        )
        return js
if __name__ == '__main__':
    # Manual smoke test, run in this order:
    #   ADV-R0624 - the javbus page returns 404 while airav has data
    #   ADN-188   - single performer
    #   CJOD-278  - several performers; javbus names use kana, airav uses kanji
    for sample_number in ('ADV-R0624', 'ADN-188', 'CJOD-278'):
        print(main(sample_number))