scraper.py (forked from mvdctop/Movie_Data_Capture)
import json
import secrets
import config
from lxml import etree
from pathlib import Path
from ADC_function import delete_all_elements_in_list, delete_all_elements_in_str, file_modification_days, load_cookies, translate
from scrapinglib.api import search
def get_data_from_json(file_number, oCC, specified_source, specified_url):
"""
    Iterate through all configured services, fetch the data, and return the metadata parsed from JSON.
"""
actor_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_actor.xml'))
info_mapping_data = etree.parse(str(Path.home() / '.local' / 'share' / 'mdc' / 'mapping_info.xml'))
conf = config.getInstance()
# default fetch order list, from the beginning to the end
sources = conf.sources()
    # TODO prepare parameters
    # - clean up ADC_function, webcrawler
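    # Build proxy settings from config only when a proxy is enabled; otherwise requests go direct.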
proxies = None
configProxy = conf.proxy()
if configProxy.enable:
proxies = configProxy.proxies()
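    # Build the candidate javdb mirror names ("javdb" + suffix) plus the default "javdb" entry.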
javdb_sites = conf.javdb_sites().split(',')
    javdb_sites = ["javdb" + i for i in javdb_sites]
javdb_sites.append("javdb")
    # Skip expired cookies: the javdb login page advertises a 7-day "stay signed in" period, so cookies are assumed valid for 7 days.
has_json = False
for cj in javdb_sites:
javdb_site = cj
cookie_json = javdb_site + '.json'
cookies_dict, cookies_filepath = load_cookies(cookie_json)
if isinstance(cookies_dict, dict) and isinstance(cookies_filepath, str):
cdays = file_modification_days(cookies_filepath)
if cdays < 7:
javdb_cookies = cookies_dict
has_json = True
break
elif cdays != 9999:
                print(f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago; it will not be used for HTTP requests.')
if not has_json:
javdb_site = secrets.choice(javdb_sites)
javdb_cookies = None
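    # Optional CA bundle for TLS verification when one is configured.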
    cacert = None
if conf.cacert_file():
cacert = conf.cacert_file()
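    # Query the configured sources in their fetch order; the first source that returns data wins.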
json_data = search(file_number, sources, proxies=proxies, verify=cacert,
dbsite=javdb_site, dbcookies=javdb_cookies,
morestoryline=conf.is_storyline(),
specifiedSource=specified_source, specifiedUrl=specified_url)
# Return if data not found in all sources
if not json_data:
print('[-]Movie Number not found!')
return None
    # Strictly verify the returned number: some faulty sources return the same record
    # ("本橋実来 ADZ335") no matter which number is submitted, so mismatched results are rejected.
    # The current number naming convention follows javdb.com (domain created 2013-06-19T18:34:27Z);
    # other conventions such as airav.wiki (domain created 2019-08-28T07:18:42.0Z) may also be worth tracking.
    # If different studios ever collide with identical numbers under the javdb.com convention,
    # consider switching conventions and updating the number parsing and scraping code accordingly.
    if str(json_data.get('number')).upper() != file_number.upper():
        # A source may change the number only when it explicitly sets 'allow_number_change'.
        if not json_data.get('allow_number_change'):
            print('[-]Movie number has changed! [{}]->[{}]'.format(file_number, str(json_data.get('number'))))
            return None
    # ================================================ end of site-specific rules ================================================
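    # Pull the individual metadata fields out of the scraped JSON.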
title = json_data.get('title')
    actor_list = str(json_data.get('actor')).strip("[ ]").replace("'", '').split(',')  # convert the string to a list
    actor_list = [actor.strip() for actor in actor_list]  # strip surrounding whitespace
director = json_data.get('director')
release = json_data.get('release')
number = json_data.get('number')
studio = json_data.get('studio')
source = json_data.get('source')
runtime = json_data.get('runtime')
outline = json_data.get('outline')
label = json_data.get('label')
series = json_data.get('series')
year = json_data.get('year')
    cover_small = json_data.get('cover_small') or ''
    trailer = json_data.get('trailer') or ''
    extrafanart = json_data.get('extrafanart') or ''
imagecut = json_data.get('imagecut')
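    # Normalize the tag string into a list and drop the placeholder 'XXXX'/'xxx' entries.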
    tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',')  # convert the string to a list
while 'XXXX' in tag:
tag.remove('XXXX')
while 'xxx' in tag:
tag.remove('xxx')
actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '')
if title == '' or number == '':
print('[-]Movie Number or Title not found!')
return None
    # if imagecut == '3':
    #     DownloadFileWithFilename()
    # ==================== replace characters that are illegal in file names ====================  #\/:*?"<>|
actor = special_characters_replacement(actor)
actor_list = [special_characters_replacement(a) for a in actor_list]
title = special_characters_replacement(title)
label = special_characters_replacement(label)
outline = special_characters_replacement(outline)
series = special_characters_replacement(series)
studio = special_characters_replacement(studio)
director = special_characters_replacement(director)
tag = [special_characters_replacement(t) for t in tag]
release = release.replace('/', '-')
    cover_small = cover_small.split(',')[0].strip('"').strip("'")
    # ==================== end of illegal-character handling ====================  #\/:*?"<>|
    # Write the processed values back into json_data.
json_data['title'] = title
json_data['original_title'] = title
json_data['actor'] = actor
json_data['release'] = release
json_data['cover_small'] = cover_small
json_data['tag'] = tag
json_data['year'] = year
json_data['actor_list'] = actor_list
json_data['trailer'] = trailer
json_data['extrafanart'] = extrafanart
json_data['label'] = label
json_data['outline'] = outline
json_data['series'] = series
json_data['studio'] = studio
json_data['director'] = director
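    # Optionally machine-translate the configured fields; titles are first looked up in the local c_number.json cache.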
if conf.is_translate():
translate_values = conf.translate_values().split(",")
for translate_value in translate_values:
if json_data[translate_value] == "":
continue
if translate_value == "title":
title_dict = json.loads(
(Path.home() / '.local' / 'share' / 'mdc' / 'c_number.json').read_text(encoding="utf-8"))
try:
json_data[translate_value] = title_dict[number]
continue
                except KeyError:
pass
if conf.get_translate_engine() == "azure":
t = translate(
json_data[translate_value],
target_language="zh-Hans",
engine=conf.get_translate_engine(),
key=conf.get_translate_key(),
)
else:
t = translate(json_data[translate_value])
if len(t):
json_data[translate_value] = special_characters_replacement(t)
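    # Optional Chinese-variant conversion: look names and terms up in the XML mapping tables first,
    # then fall back to converting with oCC.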
if oCC:
cc_vars = conf.cc_convert_vars().split(",")
ccm = conf.cc_convert_mode()
        def convert_list(mapping_data, language, values):
            total = []
            for i in values:
                mapped = mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=f",{i},")
                if len(mapped) != 0:
                    i = mapped[0]
                total.append(i)
            return total

        def convert(mapping_data, language, value):
            mapped = mapping_data.xpath('a[contains(@keyword, $name)]/@' + language, name=value)
            if len(mapped) != 0:
                return mapped[0]
            raise IndexError('keyword not found')
for cc in cc_vars:
if json_data[cc] == "" or len(json_data[cc]) == 0:
continue
if cc == "actor":
try:
if ccm == 1:
json_data['actor_list'] = convert_list(actor_mapping_data, "zh_cn", json_data['actor_list'])
json_data['actor'] = convert(actor_mapping_data, "zh_cn", json_data['actor'])
elif ccm == 2:
json_data['actor_list'] = convert_list(actor_mapping_data, "zh_tw", json_data['actor_list'])
json_data['actor'] = convert(actor_mapping_data, "zh_tw", json_data['actor'])
elif ccm == 3:
json_data['actor_list'] = convert_list(actor_mapping_data, "jp", json_data['actor_list'])
json_data['actor'] = convert(actor_mapping_data, "jp", json_data['actor'])
except:
json_data['actor_list'] = [oCC.convert(aa) for aa in json_data['actor_list']]
json_data['actor'] = oCC.convert(json_data['actor'])
elif cc == "tag":
try:
if ccm == 1:
json_data[cc] = convert_list(info_mapping_data, "zh_cn", json_data[cc])
json_data[cc] = delete_all_elements_in_list("删除", json_data[cc])
elif ccm == 2:
json_data[cc] = convert_list(info_mapping_data, "zh_tw", json_data[cc])
json_data[cc] = delete_all_elements_in_list("删除", json_data[cc])
elif ccm == 3:
json_data[cc] = convert_list(info_mapping_data, "jp", json_data[cc])
json_data[cc] = delete_all_elements_in_list("删除", json_data[cc])
except:
json_data[cc] = [oCC.convert(t) for t in json_data[cc]]
else:
try:
if ccm == 1:
json_data[cc] = convert(info_mapping_data, "zh_cn", json_data[cc])
json_data[cc] = delete_all_elements_in_str("删除", json_data[cc])
elif ccm == 2:
json_data[cc] = convert(info_mapping_data, "zh_tw", json_data[cc])
json_data[cc] = delete_all_elements_in_str("删除", json_data[cc])
elif ccm == 3:
json_data[cc] = convert(info_mapping_data, "jp", json_data[cc])
json_data[cc] = delete_all_elements_in_str("删除", json_data[cc])
except IndexError:
json_data[cc] = oCC.convert(json_data[cc])
except:
pass
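    # Build the naming rule string: literal parts are copied as-is, known field names are substituted
    # with their values (lists are joined with '&').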
naming_rule=""
for i in conf.naming_rule().split("+"):
if i not in json_data:
naming_rule += i.strip("'").strip('"')
else:
item = json_data.get(i)
naming_rule += item if type(item) is not list else "&".join(item)
json_data['naming_rule'] = naming_rule
return json_data
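

# Replace characters that are illegal in file names (\ / : * ? " < > |) with visually similar
# Unicode equivalents, and unescape a few common HTML entities.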
def special_characters_replacement(text) -> str:
if not isinstance(text, str):
return text
    return (text.replace('\\', '∖').  # U+2216 SET MINUS @ Basic Multilingual Plane
            replace('/', '∕').  # U+2215 DIVISION SLASH @ Basic Multilingual Plane
            replace(':', '꞉').  # U+A789 MODIFIER LETTER COLON @ Latin Extended-D
            replace('*', '∗').  # U+2217 ASTERISK OPERATOR @ Basic Multilingual Plane
            replace('?', '？').  # U+FF1F FULLWIDTH QUESTION MARK @ Basic Multilingual Plane
            replace('"', '＂').  # U+FF02 FULLWIDTH QUOTATION MARK @ Basic Multilingual Plane
            replace('<', 'ᐸ').  # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane
            replace('>', 'ᐳ').  # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane
            replace('|', 'ǀ').  # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
            replace('&lsquo;', '‘').  # U+2018 LEFT SINGLE QUOTATION MARK
            replace('&rsquo;', '’').  # U+2019 RIGHT SINGLE QUOTATION MARK
            replace('&hellip;', '…').  # U+2026 HORIZONTAL ELLIPSIS
            replace('&amp;', '&')  # ampersand HTML entity
            )
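

# Usage sketch (illustrative only; 'ABC-123' and the None arguments are hypothetical placeholders
# for the movie number, the OpenCC converter, and the optional source/url overrides):
#
#     data = get_data_from_json('ABC-123', None, None, None)
#     if data:
#         print(data['naming_rule'])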