forked from mvdctop/Movie_Data_Capture
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path__init__.py
296 lines (266 loc) · 11.9 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
import json
import re
from multiprocessing.pool import ThreadPool
import config
from ADC_function import translate
# =========website========
from . import airav
from . import avsox
from . import fanza
from . import fc2
from . import jav321
from . import javbus
from . import javdb
from . import mgstage
from . import xcity
# from . import javlib
from . import dlsite
from . import carib
from . import fc2club
def get_data_state(data: dict) -> bool:  # Detect a failed metadata fetch
    """Return True when *data* carries a usable ``title`` and ``number``.

    A value counts as unusable when the key is missing or the value is
    ``None``, the empty string, or the literal string ``"null"``.

    Anything that is not a dict (for example ``None``, a number, a list or
    a plain string decoded from a scraper's JSON reply) is reported as a
    failed fetch instead of raising ``TypeError``.
    """
    if not isinstance(data, dict):
        return False
    for key in ("title", "number"):
        # .get() folds "key missing" into the same None check as "value is None".
        value = data.get(key)
        if value is None or value == "" or value == "null":
            return False
    return True
# Ordered (needle, replacement) pairs applied to a studio name. The order is
# significant and mirrors the original sequence of replacements; the list is
# split in two because a regex rule runs between the halves.
_STUDIO_NAMES_BEFORE_APACHE = (
    ('アイエナジー', 'Energy'),
    ('アイデアポケット', 'Idea Pocket'),
    ('アキノリ', 'AKNR'),
    ('アタッカーズ', 'Attackers'),
)
_STUDIO_NAMES_AFTER_APACHE = (
    ('アマチュアインディーズ', 'SOD'),
    ('アリスJAPAN', 'Alice Japan'),
    ('オーロラプロジェクト・アネックス', 'Aurora Project Annex'),
    ('クリスタル映像', 'Crystal 映像'),
    ('グローリークエスト', 'Glory Quest'),
    ('ダスッ!', 'DAS!'),
    ('ディープス', 'DEEP’s'),
    ('ドグマ', 'Dogma'),
    ('プレステージ', 'PRESTIGE'),
    ('ムーディーズ', 'MOODYZ'),
    ('メディアステーション', '宇宙企画'),
    ('ワンズファクトリー', 'WANZ FACTORY'),
    ('エスワン ナンバーワンスタイル', 'S1'),
    ('エスワンナンバーワンスタイル', 'S1'),
    ('SODクリエイト', 'SOD'),
    ('サディスティックヴィレッジ', 'SOD'),
    ('V&Rプロダクツ', 'V&R PRODUCE'),
    ('V&RPRODUCE', 'V&R PRODUCE'),
    ('レアルワークス', 'Real Works'),
    ('マックスエー', 'MAX-A'),
    ('ピーターズMAX', 'PETERS MAX'),
    ('プレミアム', 'PREMIUM'),
    ('ナチュラルハイ', 'NATURAL HIGH'),
    ('マキシング', 'MAXING'),
    ('エムズビデオグループ', 'M’s Video Group'),
    ('ミニマム', 'Minimum'),
    ('ワープエンタテインメント', 'WAAP Entertainment'),
)


def _promote_sources(sources, *names):
    """Move each of *names* found in *sources* to the front, one after another.

    Names are processed left to right, so the LAST name given ends up first.
    """
    for name in names:
        if name in sources:
            sources.insert(0, sources.pop(sources.index(name)))


def _prioritize_sources(sources, file_number):
    """Reorder *sources* in place according to the shape of *file_number*.

    Only the first matching rule is applied.
    """
    lo_file_number = file_number.lower()
    if "carib" in sources and re.match(r"^\d{6}-\d{3}", file_number):
        _promote_sources(sources, "carib")
    elif re.match(r"^\d{5,}", file_number) or "heyzo" in lo_file_number:
        _promote_sources(sources, "javdb", "avsox")
    elif "mgstage" in sources and (
        re.match(r"\d+\D+", file_number) or "siro" in lo_file_number
    ):
        _promote_sources(sources, "mgstage")
    elif "fc2" in lo_file_number:
        _promote_sources(sources, "javdb", "fc2", "fc2club")
    elif "dlsite" in sources and (
        "rj" in lo_file_number or "vj" in lo_file_number
    ):
        _promote_sources(sources, "dlsite")
    elif re.match(r"^[a-z0-9]{3,}$", lo_file_number):
        _promote_sources(sources, "javdb", "xcity")


def _collect_metadata(attempts, debug):
    """Return the first decoded result that passes ``get_data_state``.

    *attempts* yields ``(source, fetch)`` pairs where ``fetch()`` returns the
    raw JSON text of one source. A source that raises is reported and
    skipped so the remaining sources still get a chance. When no source
    yields valid data, the last successfully decoded value (or ``{}``) is
    returned.
    """
    json_data = {}
    for source, fetch in attempts:
        if debug:
            print('[+]select', source)
        try:
            json_data = json.loads(fetch())
            # if any service return a valid return, break
            if get_data_state(json_data):
                break
        except Exception as err:
            print('[!] Source Failed :', source, repr(err))
    return json_data


def _normalize_studio(studio):
    """Replace known katakana studio names in *studio* with their short forms."""
    for needle, replacement in _STUDIO_NAMES_BEFORE_APACHE:
        studio = studio.replace(needle, replacement)
    studio = re.sub('アパッチ.*', 'Apache', studio)
    for needle, replacement in _STUDIO_NAMES_AFTER_APACHE:
        studio = studio.replace(needle, replacement)
    # NOTE(review): the two '/' rules below run after
    # special_characters_replacement() has been applied to the studio name;
    # if that helper already rewrites '/', they can never match - confirm
    # the intended order.
    studio = re.sub('.*/妄想族', '妄想族', studio)
    return studio.replace('/', ' ')


def get_data_from_json(file_number, conf: config.Config):  # Return metadata from JSON
    """Query the configured sources for *file_number* and return cleaned metadata.

    Sources are tried in the configured order (possibly re-prioritised from
    the shape of *file_number*) until one returns data accepted by
    ``get_data_state``.

    Args:
        file_number: the movie number used for the lookup.
        conf: configuration object; this function reads ``sources()``,
            ``multi_threading()``, ``debug()``, ``is_transalte()``,
            ``transalte_values()``, ``get_transalte_engine()``,
            ``get_transalte_key()`` and ``naming_rule()``.

    Returns:
        The metadata dict of the first source with a valid result, with
        filesystem-hostile characters replaced, the studio name normalised,
        optional translation applied and a ``naming_rule`` entry added;
        ``None`` when no source produced a usable title and number.
    """
    func_mapping = {
        "airav": airav.main,
        "avsox": avsox.main,
        "fc2": fc2.main,
        "fanza": fanza.main,
        "javdb": javdb.main,
        "javbus": javbus.main,
        "mgstage": mgstage.main,
        "jav321": jav321.main,
        "xcity": xcity.main,
        # "javlib": javlib.main,
        "dlsite": dlsite.main,
        "carib": carib.main,
        "fc2club": fc2club.main,
    }

    # default fetch order list, from the beginning to the end
    raw_sources = conf.sources()
    # Tolerate "a, b" style lists: surrounding blanks are not part of a name.
    sources = [name.strip() for name in raw_sources.split(',')]
    # NOTE(review): re-prioritising is skipped when the configured string is
    # longer than 80 characters; the reason is not visible here - confirm.
    if len(raw_sources) <= 80:
        # if the input file name matches certain rules,
        # move some web service to the beginning of the list
        _prioritize_sources(sources, file_number)

    # check sources in func_mapping
    unknown = [name for name in sources if name not in func_mapping]
    for name in unknown:
        print('[!] Source Not Exist : ' + name)
    for name in unknown:
        print('[!] Remove Source : ' + name)
    sources = [name for name in sources if name in func_mapping]

    debug = conf.debug() == True
    json_data = {}
    if not sources:
        # Nothing to query; also avoids ThreadPool(processes=0), which raises.
        pass
    elif conf.multi_threading():
        pool = ThreadPool(processes=len(sources))
        try:
            # Submit every source exactly once, in priority order, and keep
            # the handles so the results are read back from the same jobs.
            pending = [
                (name, pool.apply_async(func_mapping[name], (file_number,)).get)
                for name in sources
            ]
            json_data = _collect_metadata(pending, debug)
        finally:
            pool.close()
            pool.terminate()
    else:
        # Lazy generator: a source is only contacted when the previous ones
        # failed. ``name=name`` binds the current source to each callable.
        json_data = _collect_metadata(
            ((name, lambda name=name: func_mapping[name](file_number))
             for name in sources),
            debug,
        )

    # Return if data not found in all sources
    if not json_data or not isinstance(json_data, dict):
        print('[-]Movie Number not found!')
        return None

    # ===================== end of site rules =====================
    # A missing, None, empty or "null" title/number cannot be processed below.
    if not get_data_state(json_data):
        print('[-]Movie Number or Title not found!')
        return None

    title = json_data.get('title')
    # convert the stringified list back into a list and trim each entry
    actor_list = str(json_data.get('actor')).strip("[ ]").replace("'", '').split(',')
    actor_list = [actor.strip() for actor in actor_list]
    director = json_data.get('director')
    release = json_data.get('release')
    studio = json_data.get('studio')
    outline = json_data.get('outline')
    label = json_data.get('label')
    series = json_data.get('series')
    year = json_data.get('year')
    cover_small = json_data.get('cover_small') or ''
    trailer = json_data.get('trailer') or ''
    extrafanart = json_data.get('extrafanart') or ''
    # convert the stringified list back into a list
    tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',')
    actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '')

    # ==================== handle special characters ==================== #\/:*?"<>|
    actor = special_characters_replacement(actor)
    actor_list = [special_characters_replacement(a) for a in actor_list]
    title = special_characters_replacement(title)
    label = special_characters_replacement(label)
    outline = special_characters_replacement(outline)
    series = special_characters_replacement(series)
    studio = special_characters_replacement(studio)
    director = special_characters_replacement(director)
    tag = [special_characters_replacement(t) for t in tag]
    if isinstance(release, str):  # a source may omit the release date
        release = release.replace('/', '-')
    # Only the first comma-separated entry is kept, without surrounding quotes.
    cover_small = cover_small.split(',')[0].strip('\"').strip('\'')
    # ==================== handle special characters END ================ #\/:*?"<>|

    # === replace katakana studio names
    if isinstance(studio, str):  # a source may omit the studio
        studio = _normalize_studio(studio)
    # === replace katakana studio names END

    # store the processed values back into json_data
    json_data['title'] = title
    json_data['actor'] = actor
    json_data['release'] = release
    json_data['cover_small'] = cover_small
    json_data['tag'] = tag
    json_data['year'] = year
    json_data['actor_list'] = actor_list
    json_data['trailer'] = trailer
    json_data['extrafanart'] = extrafanart
    json_data['label'] = label
    json_data['outline'] = outline
    json_data['series'] = series
    json_data['studio'] = studio
    json_data['director'] = director

    if conf.is_transalte():
        for translate_value in conf.transalte_values().split(","):
            original_text = json_data.get(translate_value)
            # Skip fields that are not present or empty instead of failing.
            if original_text is None or original_text == "":
                continue
            if conf.get_transalte_engine() == "azure":
                t = translate(
                    original_text,
                    target_language="zh-Hans",
                    engine=conf.get_transalte_engine(),
                    key=conf.get_transalte_key(),
                )
            else:
                t = translate(original_text)
            if t:
                json_data[translate_value] = special_characters_replacement(t)

    # Each '+'-separated part is either a metadata key or a quoted literal.
    name_parts = []
    for part in conf.naming_rule().split("+"):
        if part not in json_data:
            name_parts.append(part.strip("'").strip('"'))
        else:
            value = json_data.get(part)
            # Fields may hold None or non-string values (e.g. the tag list).
            name_parts.append('' if value is None else str(value))
    json_data['naming_rule'] = "".join(name_parts)
    return json_data
# Characters that are not allowed in file names on common file systems,
# mapped to visually similar characters that are allowed. Code points are
# spelled as escapes so the look-alikes cannot be silently normalised back
# to their ASCII originals.
_SPECIAL_CHARACTERS = str.maketrans({
    '\\': '\u2216',  # U+2216 SET MINUS
    '/': '\u2215',   # U+2215 DIVISION SLASH
    ':': '\ua789',   # U+A789 MODIFIER LETTER COLON
    '*': '\u2217',   # U+2217 ASTERISK OPERATOR
    '?': '\uff1f',   # U+FF1F FULLWIDTH QUESTION MARK
    '"': '\uff02',   # U+FF02 FULLWIDTH QUOTATION MARK
    '<': '\u1438',   # U+1438 CANADIAN SYLLABICS PA
    '>': '\u1433',   # U+1433 CANADIAN SYLLABICS PO
    '|': '\u01c0',   # U+01C0 LATIN LETTER DENTAL CLICK
})


def special_characters_replacement(text) -> str:
    """Replace the characters ``\\ / : * ? " < > |`` with look-alike characters.

    Args:
        text: the value to clean. Anything that is not a ``str`` is
            returned unchanged.

    Returns:
        *text* with every listed character swapped for its look-alike, or
        the original object when it is not a string.
    """
    if not isinstance(text, str):
        return text
    # No replacement character is itself a key, so a single pass gives the
    # same result as replacing the characters one after another.
    return text.translate(_SPECIAL_CHARACTERS)