Skip to content

Commit

Permalink
Move get_data_from_json to WebCrawler
Browse files Browse the repository at this point in the history
  • Loading branch information
Suwmlee committed Jul 28, 2021
1 parent ae2c2bc commit 2c41487
Show file tree
Hide file tree
Showing 3 changed files with 285 additions and 280 deletions.
13 changes: 0 additions & 13 deletions ADC_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,6 @@
from urllib.parse import urljoin


def get_data_state(data: dict) -> bool:
    """Report whether scraped metadata is usable (metadata-failure check).

    Returns False when either the "title" or the "number" key is absent
    from ``data``, or when either value is None, an empty string, or the
    literal string "null". Returns True otherwise.
    """
    required_keys = ("title", "number")

    # Both keys must be present before any value is inspected.
    if not all(key in data for key in required_keys):
        return False

    for key in required_keys:
        value = data[key]
        if value is None or value == "" or value == "null":
            return False

    return True


def getXpathSingle(htmlcode, xpath):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result1 = str(html.xpath(xpath)).strip(" ['']")
Expand Down
283 changes: 283 additions & 0 deletions WebCrawler/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,283 @@
import sys
sys.path.append('..')
import json
import re
from multiprocessing.pool import ThreadPool
import config
from ADC_function import translate

# =========website========
from . import airav
from . import avsox
from . import fanza
from . import fc2
from . import jav321
from . import javbus
from . import javdb
from . import mgstage
from . import xcity
# from . import javlib
from . import dlsite
from . import carib
from . import fc2club


def get_data_state(data: dict) -> bool:
    """Tell whether a crawler result carries usable metadata.

    A result is usable only when it contains both "title" and "number"
    and neither value is None, "" or the string "null".
    """

    def _is_blank(value) -> bool:
        # The three placeholder values treated as "no data".
        return value is None or value == "" or value == "null"

    if "title" in data and "number" in data:
        return not (_is_blank(data["title"]) or _is_blank(data["number"]))
    return False

# Characters removed from titles: \ / : * ? " < > |
_FORBIDDEN_TITLE_CHARS = str.maketrans('', '', '\\/:*?"<>|')

# Ordered studio-name rewrites as (pattern, replacement, is_regex).
# The order is significant and mirrors the original statement sequence.
_STUDIO_RULES = (
    ('アイエナジー', 'Energy', False),
    ('アイデアポケット', 'Idea Pocket', False),
    ('アキノリ', 'AKNR', False),
    ('アタッカーズ', 'Attackers', False),
    ('アパッチ.*', 'Apache', True),
    ('アマチュアインディーズ', 'SOD', False),
    ('アリスJAPAN', 'Alice Japan', False),
    ('オーロラプロジェクト・アネックス', 'Aurora Project Annex', False),
    ('クリスタル映像', 'Crystal 映像', False),
    ('グローリークエスト', 'Glory Quest', False),
    ('ダスッ!', 'DAS!', False),
    ('ディープス', 'DEEP’s', False),
    ('ドグマ', 'Dogma', False),
    ('プレステージ', 'PRESTIGE', False),
    ('ムーディーズ', 'MOODYZ', False),
    ('メディアステーション', '宇宙企画', False),
    ('ワンズファクトリー', 'WANZ FACTORY', False),
    ('エスワン ナンバーワンスタイル', 'S1', False),
    ('エスワンナンバーワンスタイル', 'S1', False),
    ('SODクリエイト', 'SOD', False),
    ('サディスティックヴィレッジ', 'SOD', False),
    ('V&Rプロダクツ', 'V&R PRODUCE', False),
    ('V&RPRODUCE', 'V&R PRODUCE', False),
    ('レアルワークス', 'Real Works', False),
    ('マックスエー', 'MAX-A', False),
    ('ピーターズMAX', 'PETERS MAX', False),
    ('プレミアム', 'PREMIUM', False),
    ('ナチュラルハイ', 'NATURAL HIGH', False),
    ('マキシング', 'MAXING', False),
    ('エムズビデオグループ', 'M’s Video Group', False),
    ('ミニマム', 'Minimum', False),
    ('ワープエンタテインメント', 'WAAP Entertainment', False),
    ('.*/妄想族', '妄想族', True),
    ('/', ' ', False),
)


def _move_to_front(sources, *names):
    """Move each of ``names`` found in ``sources`` to the head, keeping their order."""
    slot = 0
    for name in names:
        # Names the user did not configure are skipped instead of raising.
        if name in sources:
            sources.insert(slot, sources.pop(sources.index(name)))
            slot += 1


def _prioritize_sources(sources, file_number):
    """Reorder ``sources`` in place using the file-number heuristics; first matching rule wins."""
    lo_file_number = file_number.lower()
    if "carib" in sources and re.match(r"^\d{6}-\d{3}", file_number):
        _move_to_front(sources, "carib")
    elif "avsox" in sources and (re.match(r"^\d{5,}", file_number)
                                 or "heyzo" in lo_file_number):
        _move_to_front(sources, "javdb", "avsox")
    elif "mgstage" in sources and (re.match(r"\d+\D+", file_number)
                                   or "siro" in lo_file_number):
        _move_to_front(sources, "mgstage")
    elif "fc2" in sources and "fc2" in lo_file_number:
        _move_to_front(sources, "javdb", "fc2", "fc2club")
    elif "dlsite" in sources and ("rj" in lo_file_number
                                  or "vj" in lo_file_number):
        _move_to_front(sources, "dlsite")


def _fetch_first_valid(sources, func_mapping, file_number, conf):
    """Return the first crawler result accepted by get_data_state, or {} if none."""
    if not sources:
        return {}

    debug = conf.debug()

    def _accept(source, produce):
        # Parse one crawler's JSON output; None means "try the next source".
        if debug:
            print('[+]select', source)
        try:
            candidate = json.loads(produce())
        except Exception as exc:
            # One failing site must not abort the lookup on the others.
            if debug:
                print('[-]source', source, 'failed:', exc)
            return None
        if isinstance(candidate, dict) and get_data_state(candidate):
            return candidate
        return None

    if conf.multi_threading():
        pool = ThreadPool(processes=len(sources))
        try:
            # Submit every crawler exactly once, in priority order, and keep
            # the handles so the results can be collected in the same order.
            pending = [
                (source, pool.apply_async(func_mapping[source], (file_number,)))
                for source in sources
            ]
            for source, task in pending:
                found = _accept(source, task.get)
                if found is not None:
                    return found
        finally:
            pool.close()
            pool.terminate()
        return {}

    for source in sources:
        found = _accept(source, lambda: func_mapping[source](file_number))
        if found is not None:
            return found
    return {}


def _normalize_studio(studio):
    """Apply the ordered katakana-to-romanized studio rewrites to ``studio``."""
    for pattern, replacement, is_regex in _STUDIO_RULES:
        if is_regex:
            studio = re.sub(pattern, replacement, studio)
        else:
            studio = studio.replace(pattern, replacement)
    return studio


def get_data_from_json(file_number, filepath, conf: config.Config):
    """Fetch metadata for ``file_number`` from the configured sites and post-process it.

    The sources named in ``conf.sources()`` are tried in order (optionally
    reordered by file-number heuristics) until one returns data accepted by
    ``get_data_state``.

    Args:
        file_number: Identifier passed to every crawler's ``main``.
        filepath: Not used by this function; kept for interface compatibility.
        conf: Configuration object supplying sources, rules and feature flags.

    Returns:
        The crawler's dict with 'title', 'actor', 'release', 'cover_small',
        'tag', 'location_rule', 'year', 'actor_list', 'trailer',
        'extrafanart' and 'naming_rule' set or overwritten, or None (after
        printing a message) when no source produced usable data.
    """
    func_mapping = {
        "airav": airav.main,
        "avsox": avsox.main,
        "fc2": fc2.main,
        "fanza": fanza.main,
        "javdb": javdb.main,
        "javbus": javbus.main,
        "mgstage": mgstage.main,
        "jav321": jav321.main,
        "xcity": xcity.main,
        # "javlib": javlib.main,
        "dlsite": dlsite.main,
        "carib": carib.main,
        "fc2club": fc2club.main,
    }

    # Default fetch order, from first to last. Names are stripped and names
    # without a crawler are dropped so a config typo cannot raise KeyError.
    sources = []
    for name in conf.sources().split(','):
        name = name.strip()
        if name in func_mapping:
            sources.append(name)
        elif name and conf.debug():
            print('[!]unknown source skipped:', name)

    # Reordering is only applied when the raw sources string is at most 60
    # characters long. NOTE(review): the reason for this threshold is not
    # visible here -- confirm the intent.
    if len(conf.sources()) <= 60:
        _prioritize_sources(sources, file_number)

    json_data = _fetch_first_valid(sources, func_mapping, file_number, conf)

    # Return if no source produced usable data.
    if not json_data:
        print('[-]Movie Data not found!')
        return

    # ======================== end of website rules ========================

    # NOTE: the locals below are deliberately kept even when they look
    # unused -- the eval() of the location rule further down resolves names
    # against this function's local variables.
    title = json_data.get('title')
    # Convert the stringified list back into a list, then trim each entry.
    actor_list = str(json_data.get('actor')).strip("[ ]").replace("'", '').split(',')
    actor_list = [actor.strip() for actor in actor_list]
    release = json_data.get('release') or ''  # tolerate a missing/null value
    number = json_data.get('number')
    studio = json_data.get('studio') or ''  # tolerate a missing/null value
    source = json_data.get('source')
    runtime = json_data.get('runtime')
    outline = json_data.get('outline')
    label = json_data.get('label')
    series = json_data.get('series')
    year = json_data.get('year')

    cover_small = json_data.get('cover_small')
    if cover_small is None:
        cover_small = ''
    trailer = json_data.get('trailer')
    if trailer is None:
        trailer = ''
    extrafanart = json_data.get('extrafanart')
    if extrafanart is None:
        extrafanart = ''

    imagecut = json_data.get('imagecut')
    # Convert the stringified tag list back into a list.
    tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',')
    actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '')

    # ==================== strip illegal characters ==================== \/:*?"<>|
    title = title.translate(_FORBIDDEN_TITLE_CHARS)
    release = release.replace('/', '-')
    # Keep only the first comma-separated cover entry, without quotes.
    cover_small = cover_small.split(',')[0].strip('\"').strip('\'')

    # === replace katakana studio names
    # NOTE(review): the normalized studio is not written back to json_data;
    # it is only visible to the location rule. Confirm whether that is intended.
    studio = _normalize_studio(studio)

    # SECURITY: eval() executes the configured location rule as Python code
    # with access to the locals above. Only safe while the config is trusted.
    location_rule = eval(conf.location_rule())

    if 'actor' in conf.location_rule() and len(actor) > 100:
        print(conf.location_rule())
        location_rule = eval(conf.location_rule().replace("actor", "'多人作品'"))
    maxlen = conf.max_title_len()
    if 'title' in conf.location_rule() and len(title) > maxlen:
        shorttitle = title[0:maxlen]
        location_rule = location_rule.replace(title, shorttitle)

    # Write the processed values back into json_data.
    json_data['title'] = title
    json_data['actor'] = actor
    json_data['release'] = release
    json_data['cover_small'] = cover_small
    json_data['tag'] = tag
    json_data['location_rule'] = location_rule
    json_data['year'] = year
    json_data['actor_list'] = actor_list

    if conf.is_transalte():
        for translate_value in conf.transalte_values().split(","):
            translate_value = translate_value.strip()
            # Skip unknown keys and empty values instead of raising KeyError.
            if not json_data.get(translate_value):
                continue
            if conf.get_transalte_engine() == "azure":
                json_data[translate_value] = translate(
                    json_data[translate_value],
                    target_language="zh-Hans",
                    engine=conf.get_transalte_engine(),
                    key=conf.get_transalte_key(),
                )
            else:
                json_data[translate_value] = translate(json_data[translate_value])

    json_data['trailer'] = trailer if conf.is_trailer() and trailer else ''
    json_data['extrafanart'] = (
        extrafanart if conf.is_extrafanart() and extrafanart else ''
    )

    # Build the file name: each '+'-separated token is either a json_data key
    # (replaced by its value) or a quoted literal (quotes removed).
    name_parts = []
    for token in conf.naming_rule().split("+"):
        token = token.strip()  # allow spaces around '+'
        if token in json_data:
            value = json_data[token]
            # Non-string values are coerced so concatenation cannot fail.
            name_parts.append('' if value is None else str(value))
        else:
            name_parts.append(token.strip("'").strip('"'))
    json_data['naming_rule'] = "".join(name_parts)
    return json_data
Loading

0 comments on commit 2c41487

Please sign in to comment.