diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 6b0a748e7..289c88ea4 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -33,7 +33,7 @@ jobs:
    - name: Test number_perser.get_number
      run: |
        python number_parser.py -v
-
+
    - name: Build with PyInstaller for macos/ubuntu
      if: matrix.os == 'macos-latest' || matrix.os == 'ubuntu-latest'
      run: |
@@ -42,6 +42,8 @@ jobs:
          --hidden-import ADC_function.py \
          --hidden-import core.py \
          --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
+          --add-data "Img:Img" \
+          --add-data "config.ini:." \
 
    - name: Build with PyInstaller for windows
      if: matrix.os == 'windows-latest'
@@ -51,6 +53,8 @@ jobs:
          --hidden-import ADC_function.py `
          --hidden-import core.py `
          --add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1);cloudscraper" `
+          --add-data "Img;Img" `
+          --add-data "config.ini;." `
 
    - name: Copy config.ini
      run: |
diff --git a/ADC_function.py b/ADC_function.py
index b13d0b4d3..36be65773 100755
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -1,8 +1,8 @@
 from os import replace
 import requests
-import hashlib
+#import hashlib
 from pathlib import Path
-import random
+import secrets
 import os.path
 import uuid
 import json
@@ -20,12 +20,12 @@ def getXpathSingle(htmlcode, xpath):
     return result1
 
 
-G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
+G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
 
 
 # Core web request helper
 def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None):
-    verify = config.Config().cacert_file()
-    configProxy = config.Config().proxy()
+    verify = config.getInstance().cacert_file()
+    configProxy = config.getInstance().proxy()
     errors = ""
     if ua is None:
@@ -61,7 +61,7 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None)
 
 
 def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
-    configProxy = config.Config().proxy()
+    configProxy = config.getInstance().proxy()
     errors = ""
     headers_ua = {"User-Agent": G_USER_AGENT}
     if headers is None:
@@ -85,8 +85,12 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
 
 
 def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None):
-    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
-    configProxy = config.Config().proxy()
+    s = None
+    if isinstance(cookies, dict) and len(cookies):
+        s = requests.Session()
+        requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
+    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
+    configProxy = config.getInstance().proxy()
     if configProxy.enable:
         browser.session.proxies = configProxy.proxies()
     result = browser.open(url)
@@ -103,17 +107,19 @@ def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type:
     return result.text
 
 
-def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
-    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
-    if isinstance(cookies, dict):
-        requests.utils.add_dict_to_cookiejar(browser.session.cookies, cookies)
-    configProxy = config.Config().proxy()
+def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
+    s = None
+    if isinstance(cookies, dict) and len(cookies):
+        s = requests.Session()
+        requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
+    browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
+    configProxy = config.getInstance().proxy()
     if configProxy.enable:
         browser.session.proxies = configProxy.proxies()
     result = browser.open(url)
     if not result.ok:
         return ''
-    form = browser.select_form() if form_name is None else browser.select_form(form_name)
+    form = browser.select_form() if form_select is None else browser.select_form(form_select)
     if isinstance(fields, dict):
         for k, v in fields.items():
             browser[k] = v
@@ -131,7 +137,7 @@ def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: d
 
 # def get_javlib_cookie() -> [dict, str]:
 #     import cloudscraper
-#     switch, proxy, timeout, retry_count, proxytype = config.Config().proxy()
+#     switch, proxy, timeout, retry_count, proxytype = config.getInstance().proxy()
 #     proxies = get_proxy(proxy, proxytype)
 #
 #     raw_cookie = {}
@@ -158,7 +164,7 @@ def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: d
 
 
 def translateTag_to_sc(tag):
-    tranlate_to_sc = config.Config().transalte_to_sc()
+    tranlate_to_sc = config.getInstance().transalte_to_sc()
     if tranlate_to_sc:
         dict_gen = {'中文字幕': '中文字幕', '高清': 'XXXX', '字幕': 'XXXX', '推薦作品': '推荐作品', '通姦': '通奸', '淋浴': '淋浴', '舌頭': '舌头',
@@ -505,8 +511,11 @@ def translate(
     delay: int = 0,
 ):
     trans_result = ""
+    # Chinese sentences containing characters such as '&' get truncated by Google Translate and lose content,
+    # and translating Chinese into Chinese is pointless anyway, so only translate text containing Japanese kana
+    if not is_japanese(src):
+        return src
     if engine == "google-free":
-        gsite = config.Config().get_translate_service_site()
+        gsite = config.getInstance().get_translate_service_site()
         if not re.match('^translate\.google\.(com|com\.\w{2}|\w{2})$', gsite):
             gsite = 'translate.google.cn'
         url = (
@@ -521,7 +530,7 @@
             trans_result = trans_result.join(translate_list)
     # elif engine == "baidu":
     #     url = "https://fanyi-api.baidu.com/api/trans/vip/translate"
-    #     salt = random.randint(1, 1435660288)
+    #     salt = secrets.randbelow(1435660287) + 1  # random.randint(1, 1435660288)
    #     sign = app_id + src + str(salt) + key
    #     sign = hashlib.md5(sign.encode()).hexdigest()
    #     url += (
@@ -560,17 +569,6 @@
     return trans_result
 
 
-# ======================================================================== Is the movie uncensored?
-def is_uncensored(number):
-    if re.match('^\d{4,}', number) or re.match('n\d{4}', number) or 'HEYZO' in number.upper():
-        return True
-    configs = config.Config().get_uncensored()
-    prefix_list = str(configs).split(',')
-    for pre in prefix_list:
-        if pre.upper() in number.upper():
-            return True
-    return False
-
 # Cookies exported from a browser's site-login session; lets member-only pages that guests cannot reach be opened
 # Example: FC2-755670 url https://javdb9.com/v/vO8Mn
 # JSON file format:
@@ -593,20 +591,20 @@ def load_cookies(filename):
     filename = os.path.basename(filename)
     if not len(filename):
         return None, None
-    path_search_order = [
-        f"./{filename}",
-        os.path.join(Path.home(), filename),
-        os.path.join(Path.home(), f".avdc/{filename}"),
-        os.path.join(Path.home(), f".local/share/avdc/{filename}")
-]
+    path_search_order = (
+        Path.cwd() / filename,
+        Path.home() / filename,
+        Path.home() / f".avdc/{filename}",
+        Path.home() / f".local/share/avdc/{filename}"
+    )
     cookies_filename = None
-    for p in path_search_order:
-        if os.path.exists(p):
-            cookies_filename = os.path.abspath(p)
-            break
-    if not cookies_filename:
-        return None, None
     try:
+        for p in path_search_order:
+            if p.is_file():
+                cookies_filename = str(p.resolve())
+                break
+        if not cookies_filename:
+            return None, None
         return json.load(open(cookies_filename)), cookies_filename
     except:
         return None, None
@@ -623,10 +621,9 @@ def file_modification_days(filename) -> int:
         return 9999
     return days
 
-# Check whether the file is a link
-def is_link(filename: str):
-    if os.path.islink(filename):
-        return True  # symlink
-    elif os.stat(filename).st_nlink > 1:
-        return True  # hard link Linux MAC OSX Windows NTFS
-    return False
+def file_not_exist_or_empty(filepath) -> bool:
+    return not os.path.isfile(filepath) or os.path.getsize(filepath) == 0
+
+# Simple Japanese-text detection
+def is_japanese(s) -> bool:
+    return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', s, re.UNICODE))
diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py
index c1c7ee4b3..e87be0380 100755
--- a/AV_Data_Capture.py
+++ b/AV_Data_Capture.py
@@ -6,12 +6,13 @@
 import shutil
 import typing
 import urllib3
+import signal
 import config
 from datetime import datetime, timedelta
 import time
 from pathlib import Path
-from ADC_function import file_modification_days, get_html, is_link
+from ADC_function import file_modification_days, get_html
 from number_parser import get_number
 from core import core_main, moveFailedFolder
@@ -35,30 +36,54 @@ def check_update(local_version):
 
 
 def argparse_function(ver: str) -> typing.Tuple[str, str, bool]:
-    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    conf = config.getInstance()
+    parser = argparse.ArgumentParser(epilog=f"Load Config file '{conf.ini_path}'.")
     parser.add_argument("file", default='', nargs='?', help="Single Movie file path.")
     parser.add_argument("-p","--path",default='',nargs='?',help="Analysis folder path.")
-    # parser.add_argument("-c", "--config", default='config.ini', nargs='?', help="The config file Path.")
-    default_logdir = os.path.join(Path.home(),'.avlogs')
+    parser.add_argument("-m","--main-mode",default='',nargs='?',help="Main mode. 1:Scraping 2:Organizing 3:Scraping in analysis folder")
+    parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number of single movie file.")
+    # parser.add_argument("-C", "--config", default='config.ini', nargs='?', help="The config file Path.")
+    default_logdir = str(Path.home() / '.avlogs')
     parser.add_argument("-o","--log-dir",dest='logdir',default=default_logdir,nargs='?',
-                        help=f"""Duplicate stdout and stderr to logfiles
-in logging folder, default on.
-default for current user: {default_logdir}
-Use --log-dir= to turn off logging feature.""")
-    parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number")
-    parser.add_argument("-a", "--auto-exit", dest='autoexit', action="store_true",
-                        help="Auto exit after program complete")
+                        help=f"""Duplicate stdout and stderr to logfiles in logging folder, default on.
+        default folder for current user: '{default_logdir}'.
+        Replace the default folder with an empty file, or use --log-dir= to turn logging off.""")
     parser.add_argument("-q","--regex-query",dest='regexstr',default='',nargs='?',help="python re module regex filepath filtering.")
+    parser.add_argument("-d","--nfo-skip-days",dest='days',default='',nargs='?', help="Override nfo_skip_days value in config.")
+    parser.add_argument("-c","--stop-counter",dest='cnt',default='',nargs='?', help="Override stop_counter value in config.")
+    parser.add_argument("-i", "--ignore-failed-list", action="store_true", help="Ignore failed list '{}'".format(
+        os.path.join(os.path.abspath(conf.failed_folder()), 'failed_list.txt')))
+    parser.add_argument("-a", "--auto-exit", action="store_true",
+                        help="Auto exit after program complete")
+    parser.add_argument("-g","--debug", action="store_true",
+                        help="Turn on debug mode to generate diagnostic log for issue report.")
+    parser.add_argument("-z","--zero-operation",dest='zero_op', action="store_true",
+                        help="""Only show job list of files and numbers, and **NO** actual operation
+is performed. It may help you correct wrong numbers before the real job.""")
     parser.add_argument("-v", "--version", action="version", version=ver)
+    #ini_path
     args = parser.parse_args()
-
-    return args.file, args.path, args.number, args.autoexit, args.logdir, args.regexstr
-
+    def get_natural_number_or_none(value):
+        return int(value) if isinstance(value, str) and value.isnumeric() and int(value)>=0 else None
+    def get_str_or_none(value):
+        return value if isinstance(value, str) and len(value) else None
+    def get_bool_or_none(value):
+        return True if isinstance(value, bool) and value else None
+    config.G_conf_override["common:main_mode"] = get_natural_number_or_none(args.main_mode)
+    config.G_conf_override["common:source_folder"] = get_str_or_none(args.path)
+    config.G_conf_override["common:auto_exit"] = get_bool_or_none(args.auto_exit)
+    config.G_conf_override["common:nfo_skip_days"] = get_natural_number_or_none(args.days)
+    config.G_conf_override["common:stop_counter"] = get_natural_number_or_none(args.cnt)
+    config.G_conf_override["common:ignore_failed_list"] = get_bool_or_none(args.ignore_failed_list)
+    config.G_conf_override["debug_mode:switch"] = get_bool_or_none(args.debug)
+
+    return args.file, args.number, args.logdir, args.regexstr, args.zero_op
 
 
 class OutLogger(object):
     def __init__(self, logfile) -> None:
         self.term = sys.stdout
         self.log = open(logfile,"w",encoding='utf-8',buffering=1)
+        self.filepath = logfile
     def __del__(self):
         self.close()
     def __enter__(self):
@@ -85,6 +110,7 @@ class ErrLogger(OutLogger):
     def __init__(self, logfile) -> None:
         self.term = sys.stderr
         self.log = open(logfile,"w",encoding='utf-8',buffering=1)
+        self.filepath = logfile
     def close(self):
         if self.term != None:
             sys.stderr = self.term
@@ -97,14 +123,18 @@ def close(self):
 
 def dupe_stdout_to_logfile(logdir: str):
     if not isinstance(logdir, str) or len(logdir) == 0:
         return
-    if not os.path.isdir(logdir):
-        os.makedirs(logdir)
-        if not os.path.isdir(logdir):
-            return
-
+    log_dir = Path(logdir)
+    if not log_dir.exists():
+        try:
+            log_dir.mkdir(parents=True,exist_ok=True)
+        except:
+            pass
+    if not log_dir.is_dir():
+        return  # Tip: logging can be disabled by replacing the log directory with an empty regular file of the same name
+    abslog_dir = log_dir.resolve()
     log_tmstr = datetime.now().strftime("%Y%m%dT%H%M%S")
-    logfile = os.path.join(logdir, f'avdc_{log_tmstr}.txt')
-    errlog = os.path.join(logdir, f'avdc_{log_tmstr}_err.txt')
+    logfile = abslog_dir / f'avdc_{log_tmstr}.txt'
+    errlog = abslog_dir / f'avdc_{log_tmstr}_err.txt'
 
     sys.stdout = OutLogger(logfile)
     sys.stderr = ErrLogger(errlog)
 
@@ -113,28 +143,126 @@ def dupe_stdout_to_logfile(logdir: str):
 def close_logfile(logdir: str):
     if not isinstance(logdir, str) or len(logdir) == 0 or not os.path.isdir(logdir):
         return
+    # Save the log file path before closing
+    filepath = None
+    try:
+        filepath = sys.stdout.filepath
+    except:
+        pass
     sys.stdout.close()
     sys.stderr.close()
+    log_dir = Path(logdir).resolve()
+    if isinstance(filepath, Path):
+        print(f"Log file '{filepath}' saved.")
+        assert(filepath.parent.samefile(log_dir))
     # Clean up empty files
-    for current_dir, subdirs, files in os.walk(logdir, topdown=False):
+    for f in log_dir.glob(r'*_err.txt'):
+        if f.stat().st_size == 0:
+            try:
+                f.unlink(missing_ok=True)
+            except:
+                pass
+    # Merge logs. Only text logs directly inside the log directory are scanned; subdirectories are ignored.
+    # Logs older than three days are merged into one log per day; logs older than three months into one log
+    # per month; and monthly logs from last year or earlier are merged into yearly logs from April onward.
+    # Test steps:
+    """
+    LOGDIR=/tmp/avlog
+    mkdir -p $LOGDIR
+    for f in {2016..2020}{01..12}{01..28};do;echo $f>$LOGDIR/avdc_${f}T235959.txt;done
+    for f in {01..09}{01..28};do;echo 2021$f>$LOGDIR/avdc_2021${f}T235959.txt;done
+    for f in {00..23};do;echo 20211001T$f>$LOGDIR/avdc_20211001T${f}5959.txt;done
+    echo "$(ls -1 $LOGDIR|wc -l) files in $LOGDIR"
+    # 1932 files in /tmp/avlog
+    avdc -zgic1 -d0 -m3 -o $LOGDIR
+    # python3 ./AV_Data_Capture.py -zgic1 -o $LOGDIR
+    ls $LOGDIR
+    # rm -rf $LOGDIR
+    """
+    today = datetime.today()
+    # Step 1: merge into daily logs. Logs older than 3 days whose file names fall on the same day become one log.
+    for i in range(1):
+        txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{8}T\d{6}$', f.stem, re.A)]
+        if not txts or not len(txts):
+            break
+        e = [f for f in txts if '_err' in f.stem]
+        txts.sort()
+        tmstr_3_days_ago = (today.replace(hour=0) - timedelta(days=3)).strftime("%Y%m%dT99")
+        deadline_day = f'avdc_{tmstr_3_days_ago}'
+        day_merge = [f for f in txts if f.stem < deadline_day]
+        if not day_merge or not len(day_merge):
+            break
+        cutday = len('T235959.txt')  # cut length avdc_20201201|T235959.txt
+        for f in day_merge:
+            try:
+                day_file_name = str(f)[:-cutday] + '.txt'  # avdc_20201201.txt
+                with open(day_file_name, 'a', encoding='utf-8') as m:
+                    m.write(f.read_text(encoding='utf-8'))
+                f.unlink(missing_ok=True)
+            except:
+                pass
+    # Step 2: merge into monthly logs
+    for i in range(1):  # single-pass loop; break jumps to the next step, avoiding a big indented if block or goto-style flow
+        txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{8}$', f.stem, re.A)]
+        if not txts or not len(txts):
+            break
+        txts.sort()
+        tmstr_3_month_ago = (today.replace(day=1) - timedelta(days=3*30)).strftime("%Y%m32")
+        deadline_month = f'avdc_{tmstr_3_month_ago}'
+        month_merge = [f for f in txts if f.stem < deadline_month]
+        if not month_merge or not len(month_merge):
+            break
+        tomonth = len('01.txt')  # cut length avdc_202012|01.txt
+        for f in month_merge:
+            try:
+                month_file_name = str(f)[:-tomonth] + '.txt'  # avdc_202012.txt
+                with open(month_file_name, 'a', encoding='utf-8') as m:
+                    m.write(f.read_text(encoding='utf-8'))
+                f.unlink(missing_ok=True)
+            except:
+                pass
+    # Step 3: merge monthly logs into yearly logs
+    if today.month < 4:
+        return
+    mons = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{6}$', f.stem, re.A)]
+    if not mons or not len(mons):
+        return
+    mons.sort()
+    deadline_year = f'avdc_{today.year-1}13'
+    year_merge = [f for f in mons if f.stem < deadline_year]
+    if not year_merge or not len(year_merge):
+        return
+    toyear = len('12.txt')  # cut length avdc_2020|12.txt
+    for f in year_merge:
         try:
-            for f in files:
-                full_name = os.path.join(current_dir, f)
-                if os.path.getsize(full_name) == 0:
-                    os.remove(full_name)
+            year_file_name = str(f)[:-toyear] + '.txt'  # avdc_2020.txt
+            with open(year_file_name, 'a', encoding='utf-8') as y:
+                y.write(f.read_text(encoding='utf-8'))
+            f.unlink(missing_ok=True)
         except:
             pass
+    # Step 4: compressing yearly logs. If compression is needed, do it by hand or schedule an external script.
+    # nongnu's lzip is recommended: for text logs of this granularity its ratio is currently the best. With
+    # lzip -9 it beats xz -9 while using less memory, utilizing multiple cores better (plzip is the multi-threaded
+    # build), and decompressing faster. Compressed size is roughly 2.4% to 3.7% of the original; a 100MB log
+    # file shrinks to about 3.7MB.
+
+
+def signal_handler(*args):
+    print('[!]Ctrl+C detected, Exit.')
+    sys.exit(9)
+
+def sigdebug_handler(*args):
+    config.G_conf_override["debug_mode:switch"] = not config.G_conf_override["debug_mode:switch"]
+    print('[!]Debug {}'.format('On' if config.getInstance().debug() else 'Off'))
 
-# Rewritten video file scan: recursion removed, global variables dropped, failed-file-list skipping added
-def movie_lists(root, conf, regexstr):
-    escape_folder = re.split("[,,]", conf.escape_folder())
+# Added: skip files on the failed list, skip by .nfo modification age, report the total of skipped videos,
+# list each skipped file in debug mode (-g), and skip small ad clips
+def movie_lists(source_folder, regexstr):
+    conf = config.getInstance()
     main_mode = conf.main_mode()
     debug = conf.debug()
     nfo_skip_days = conf.nfo_skip_days()
     soft_link = conf.soft_link()
-    total = []
-    file_type = conf.media_type().upper().split(",")
+    file_type = conf.media_type().lower().split(",")
     trailerRE = re.compile(r'-trailer\.', re.IGNORECASE)
     cliRE = None
     if isinstance(regexstr, str) and len(regexstr):
@@ -142,72 +270,94 @@
         try:
             cliRE = re.compile(regexstr, re.IGNORECASE)
         except:
             pass
+    failed_list_txt_path = Path(conf.failed_folder()).resolve() / 'failed_list.txt'
     failed_set = set()
-    if main_mode == 3 or soft_link:
+    if (main_mode == 3 or soft_link) and not conf.ignore_failed_list():
         try:
-            with open(os.path.join(conf.failed_folder(), 'failed_list.txt'), 'r', encoding='utf-8') as flt:
-                flist = flt.read().splitlines()
-                failed_set = set(flist)
-                flt.close()
-            if len(flist) != len(failed_set):
-                with open(os.path.join(conf.failed_folder(), 'failed_list.txt'), 'w', encoding='utf-8') as flt:
-                    flt.writelines([line + '\n' for line in failed_set])
-                    flt.close()
+            flist = failed_list_txt_path.read_text(encoding='utf-8').splitlines()
+            failed_set = set(flist)
+            if len(flist) != len(failed_set):  # Deduplicate and write back without reordering failed_list.txt; only the last occurrence of a duplicate is kept
+                fset = failed_set.copy()
+                for i in range(len(flist)-1, -1, -1):
+                    fset.remove(flist[i]) if flist[i] in fset else flist.pop(i)
+                failed_list_txt_path.write_text('\n'.join(flist) + '\n', encoding='utf-8')
+                assert len(fset) == 0 and len(flist) == len(failed_set)
         except:
             pass
-    for current_dir, subdirs, files in os.walk(root, topdown=False):
-        if len(set(current_dir.replace("\\","/").split("/")) & set(escape_folder)) > 0:
+    if not Path(source_folder).is_dir():
+        print('[-]Source folder not found!')
+        return []
+    total = []
+    source = Path(source_folder).resolve()
+    skip_failed_cnt, skip_nfo_days_cnt = 0, 0
+    escape_folder_set = set(re.split("[,,]", conf.escape_folder()))
+    for full_name in source.glob(r'**/*'):
+        if main_mode != 3 and set(full_name.parent.parts) & escape_folder_set:
+            continue
+        if not full_name.suffix.lower() in file_type:
+            continue
+        absf = str(full_name)
+        if absf in failed_set:
+            skip_failed_cnt += 1
+            if debug:
+                print('[!]Skip failed movie:', absf)
+            continue
+        is_sym = full_name.is_symlink()
+        if main_mode != 3 and (is_sym or full_name.stat().st_nlink > 1):  # short-circuit boolean: don't stat() symlinks, they may point at missing targets
+            continue  # file is symlink or hardlink(Linux/NTFS/Darwin)
+        # Zero-byte debug samples are allowed through; drop sub-120MB ad clips such as '苍老师强力推荐.mp4'(102.2MB), '黑道总裁.mp4'(98.4MB), '有趣的妹子激情表演.MP4'(95MB), '有趣的臺灣妹妹直播.mp4'(15.1MB)
+        movie_size = 0 if is_sym else full_name.stat().st_size  # as above, don't stat() symlinks for st_size; assign 0 to bypass the small-video check
+        if movie_size > 0 and movie_size < 125829120:  # 1024*1024*120=125829120
+            continue
+        if cliRE and not cliRE.search(absf) or trailerRE.search(full_name.name):
+            continue
+        if main_mode == 3 and nfo_skip_days > 0 and file_modification_days(full_name.with_suffix('.nfo')) <= nfo_skip_days:
+            skip_nfo_days_cnt += 1
+            if debug:
+                print(f"[!]Skip movie by its .nfo modified within {nfo_skip_days} days: '{absf}'")
             continue
-        for f in files:
-            full_name = os.path.join(current_dir, f)
-            if not os.path.splitext(full_name)[1].upper() in file_type:
-                continue
-            absf = os.path.abspath(full_name)
-            if absf in failed_set:
-                if debug:
-                    print('[!]Skip failed file:', absf)
-                continue
-            if cliRE and not cliRE.search(absf):
-                continue
-            if main_mode == 3 and nfo_skip_days > 0:
-                nfo = Path(absf).with_suffix('.nfo')
-                if file_modification_days(nfo) <= nfo_skip_days:
-                    continue
-            if (main_mode == 3 or not is_link(absf)) and not trailerRE.search(f):
-                total.append(absf)
+        total.append(absf)
+
+    if skip_failed_cnt:
+        print(f"[!]Skip {skip_failed_cnt} movies in failed list '{failed_list_txt_path}'.")
+    if skip_nfo_days_cnt:
+        print(f"[!]Skip {skip_nfo_days_cnt} movies in source folder '{source}' whose .nfo was modified within {nfo_skip_days} days.")
     if nfo_skip_days <= 0 or not soft_link or main_mode == 3:
         return total
     # In soft-link mode, movies already scraped successfully must also have their .nfo age checked in the success folder, skipping those updated within N days
     skip_numbers = set()
-    success_folder = conf.success_folder()
-    for current_dir, subdirs, files in os.walk(success_folder, topdown=False):
-        for f in files:
-            f_obj = Path(f)
-            if f_obj.suffix.lower() != '.nfo':
-                continue
-            if file_modification_days(Path(current_dir) / f_obj) > nfo_skip_days:
-                continue
-            number = get_number(False, f_obj.stem)
-            if number:
-                skip_numbers.add(number.upper())
+    success_folder = Path(conf.success_folder()).resolve()
+    for f in success_folder.glob(r'**/*'):
+        if not re.match(r'\.nfo', f.suffix, re.IGNORECASE):
+            continue
+        if file_modification_days(f) > nfo_skip_days:
+            continue
+        number = get_number(False, f.stem)
+        if not number:
+            continue
+        skip_numbers.add(number.lower())
+
     rm_list = []
     for f in total:
         n_number = get_number(False, os.path.basename(f))
-        if n_number and n_number.upper() in skip_numbers:
+        if n_number and n_number.lower() in skip_numbers:
             rm_list.append(f)
     for f in rm_list:
         total.remove(f)
+        if debug:
+            print(f"[!]Skip file successfully processed within {nfo_skip_days} days: '{f}'")
+    if len(rm_list):
+        print(f"[!]Skip {len(rm_list)} movies in success folder '{success_folder}' whose .nfo was modified within {nfo_skip_days} days.")
+
     return total
 
 
 def create_failed_folder(failed_folder):
-    if not os.path.isdir(failed_folder):  # Create the failed folder
+    if not os.path.exists(failed_folder):  # Create the failed folder
         try:
             os.makedirs(failed_folder)
-            if not os.path.isdir(failed_folder):
-                raise
         except:
-            print("[-]failed!can not be make folder 'failed'\n[-](Please run as Administrator)")
+            print(f"[-]Fatal error! Cannot create folder '{failed_folder}'")
             sys.exit(0)
@@ -227,24 +377,29 @@ def rm_empty_folder(path):
             pass
 
 
-def create_data_and_move(file_path: str, c: config.Config, debug):
+def create_data_and_move(file_path: str, zero_op):
     # Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4
-    file_name = os.path.basename(file_path)
-    n_number = get_number(debug, file_name)
+    debug = config.getInstance().debug()
+    n_number = get_number(debug, os.path.basename(file_path))
     file_path = os.path.abspath(file_path)
 
     if debug == True:
-        print(f"[!]Making Data for [{file_path}], the number is [{n_number}]")
+        print(f"[!] Using [{n_number}] as number, making data for '{file_path}'")
+        if zero_op:
+            return
         if n_number:
-            core_main(file_path, n_number, c)
+            core_main(file_path, n_number)
         else:
             print("[-] number empty ERROR")
+            moveFailedFolder(file_path)
         print("[*]======================================================")
     else:
         try:
-            print(f"[!]Making Data for [{file_path}], the number is [{n_number}]")
+            print(f"[!] Using [{n_number}] as number, making data for '{file_path}'")
+            if zero_op:
+                return
             if n_number:
-                core_main(file_path, n_number, c)
+                core_main(file_path, n_number)
             else:
                 raise ValueError("number empty")
             print("[*]======================================================")
@@ -253,22 +408,26 @@
            print('[-]', err)
 
            try:
-                moveFailedFolder(file_path, conf)
+                moveFailedFolder(file_path)
            except Exception as err:
                print('[!]', err)
 
 
-def create_data_and_move_with_custom_number(file_path: str, c: config.Config, custom_number):
+def create_data_and_move_with_custom_number(file_path: str, custom_number):
+    conf = config.getInstance()
     file_name = os.path.basename(file_path)
     try:
-        print("[!]Making Data for [{}], the number is [{}]".format(file_path, custom_number))
-        core_main(file_path, custom_number, c)
+        print("[!] Using [{1}] as number, making data for '{0}'".format(file_path, custom_number))
+        if custom_number:
+            core_main(file_path, custom_number)
+        else:
+            print("[-] number empty ERROR")
         print("[*]======================================================")
     except Exception as err:
         print("[-] [{}] ERROR:".format(file_path))
         print('[-]', err)
 
-        if c.soft_link():
+        if conf.soft_link():
            print("[-]Link {} to failed folder".format(file_path))
            os.symlink(file_path, os.path.join(conf.failed_folder(), file_name))
         else:
@@ -279,12 +438,26 @@ def create_data_and_move_with_custom_number(file_path: str, c: config.Config, cu
            print('[!]', err)
 
 
-if __name__ == '__main__':
+def main():
     version = '5.0.1'
     urllib3.disable_warnings()  # Ignore http proxy warning
+
+    # Read config.ini first; argparse_function() needs conf.failed_folder()
+    conf = config.Config("config.ini")
+
     # Parse command line args
-    single_file_path, folder_path, custom_number, auto_exit, logdir, regexstr = argparse_function(version)
+    single_file_path, custom_number, logdir, regexstr, zero_op = argparse_function(version)
+    main_mode = conf.main_mode()
+    if not main_mode in (1, 2, 3):
+        print(f"[-]Main mode must be 1 or 2 or 3! You can run '{os.path.basename(sys.argv[0])} --help' for more help.")
+        sys.exit(4)
+
+    signal.signal(signal.SIGINT, signal_handler)
+    if sys.platform == 'win32':
+        signal.signal(signal.SIGBREAK, sigdebug_handler)
+    else:
+        signal.signal(signal.SIGWINCH, sigdebug_handler)
     dupe_stdout_to_logfile(logdir)
 
     print('[*]================== AV Data Capture ===================')
@@ -293,55 +466,62 @@ def create_data_and_move_with_custom_number(file_path: str, c: config.Config, cu
     print('[*]======================================================')
     print('[*]严禁在墙内宣传本项目')
 
-    # Read config.ini
-    conf = config.Config("config.ini")
-
+    start_time = time.time()
+    print('[+]Start at', time.strftime("%Y-%m-%d %H:%M:%S"))
     if conf.update_check():
         check_update(version)
 
+    print(f"[+]Load Config file '{conf.ini_path}'.")
     if conf.debug():
         print('[+]Enable debug')
     if conf.soft_link():
         print('[!]Enable soft link')
-    #print('[!]CmdLine:'," ".join(sys.argv[1:]))
+    if len(sys.argv)>1:
+        print('[!]CmdLine:'," ".join(sys.argv[1:]))
+    print('[+]Main Working mode ## {}: {} ## {}{}{}'
+          .format(*(main_mode, ['Scraping', 'Organizing', 'Scraping in analysis folder'][main_mode-1],
+                    "" if not conf.multi_threading() else ", multi_threading on",
+                    "" if conf.nfo_skip_days() == 0 else f", nfo_skip_days={conf.nfo_skip_days()}",
+                    "" if conf.stop_counter() == 0 else f", stop_counter={conf.stop_counter()}"
+                    ) if not single_file_path else ('-', 'Single File', '', '', ''))
+          )
 
     create_failed_folder(conf.failed_folder())
-    start_time = time.time()
 
     if not single_file_path == '':  # Single File
         print('[+]==================== Single File =====================')
         if custom_number == '':
-            create_data_and_move_with_custom_number(single_file_path, conf, get_number(conf.debug(), os.path.basename(single_file_path)))
+            create_data_and_move_with_custom_number(single_file_path, get_number(conf.debug(), os.path.basename(single_file_path)))
         else:
-            create_data_and_move_with_custom_number(single_file_path, conf, custom_number)
     else:
-        if folder_path == '':
+            create_data_and_move_with_custom_number(single_file_path, custom_number)
+    else:
+        folder_path = conf.source_folder()
+        if not isinstance(folder_path, str) or folder_path == '':
            folder_path = os.path.abspath(".")
-        movie_list = movie_lists(folder_path, conf, regexstr)
+        movie_list = movie_lists(folder_path, regexstr)
 
        count = 0
        count_all = str(len(movie_list))
-        print('[+]Find', count_all, 'movies. Start at', time.strftime("%Y-%m-%d %H:%M:%S"))
-        main_mode = conf.main_mode()
+        print('[+]Found', count_all, 'movies.')
+        print('[*]======================================================')
        stop_count = conf.stop_counter()
        if stop_count<1:
            stop_count = 999999
        else:
            count_all = str(min(len(movie_list), stop_count))
-        if main_mode == 3:
-            print(f'[!]运行模式:**维护模式**,本程序将在处理{count_all}个视频文件后停止,如需后台执行自动退出请结合 -a 参数。')
+
        for movie_path in movie_list:  # Iterate over the movie list, handing each file to core
            count = count + 1
            percentage = str(count / int(count_all) * 100)[:4] + '%'
-            print('[!] - ' + percentage + ' [' + str(count) + '/' + count_all + '] -')
-            create_data_and_move(movie_path, conf, conf.debug())
+            print('[!] {:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -', time.strftime("%H:%M:%S")))
+            create_data_and_move(movie_path, zero_op)
            if count >= stop_count:
                print("[!]Stop counter triggered!")
                break
 
-    if conf.del_empty_folder():
+    if conf.del_empty_folder() and not zero_op:
        rm_empty_folder(conf.success_folder())
        rm_empty_folder(conf.failed_folder())
        if len(folder_path):
@@ -353,9 +533,15 @@ def create_data_and_move_with_custom_number(file_path: str, c: config.Config, cu
          " End at", time.strftime("%Y-%m-%d %H:%M:%S"))
 
     print("[+]All finished!!!")
-    if not (conf.auto_exit() or auto_exit):
-        input("Press enter key exit, you can check the error message before you exit...")
     close_logfile(logdir)
+
+    if not conf.auto_exit():
+        input("Press enter key exit, you can check the error message before you exit...")
+
+    sys.exit(0)
+
+
+import multiprocessing
+if __name__ == '__main__':
+    multiprocessing.freeze_support()
+    main()
diff --git a/Makefile b/Makefile
index 407aa4b00..4c8960aa8 100644
--- a/Makefile
+++ b/Makefile
@@ -16,7 +16,9 @@ make:
	#export cloudscraper_path=$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1)
 
	@echo "[+]Pyinstaller make"
-	pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "Img:Img"
+	pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
+		--add-data "Img:Img" \
+		--add-data "config.ini:." \
 
	@echo "[+]Move to bin"
	if [ ! -d "./bin" ];then mkdir bin; fi
diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py
index e1608b661..039fed0c5 100644
--- a/WebCrawler/__init__.py
+++ b/WebCrawler/__init__.py
@@ -32,7 +32,7 @@ def get_data_state(data: dict) -> bool:  # Detect failed metadata fetch
     return True
 
 
-def get_data_from_json(file_number, conf: config.Config):  # Return metadata from JSON
+def get_data_from_json(file_number):  # Return metadata from JSON
     """
     iterate through all services and fetch the data
     """
@@ -53,6 +53,7 @@ def get_data_from_json(file_number, conf: config.Config):  # Return metadata from JSON
         "fc2club": fc2club.main
     }
 
+    conf = config.getInstance()
     # default fetch order list, from the beginning to the end
     sources = conf.sources().split(',')
     if not len(conf.sources()) > 80:
@@ -114,6 +115,7 @@ def get_data_from_json(file_number, conf: config.Config):  # Return metadata from JSON
                    json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
                    # if any service returns a valid result, break
                    if get_data_state(json_data):
+                        print(f"[+]Found movie [{file_number}] metadata on website '{source}'")
                        break
            pool.close()
            pool.terminate()
@@ -125,6 +127,7 @@ def get_data_from_json(file_number, conf: config.Config):  # Return metadata from JSON
                json_data = json.loads(func_mapping[source](file_number))
                # if any service returns a valid result, break
                if get_data_state(json_data):
+                    print(f"[+]Found movie [{file_number}] metadata on website '{source}'")
                    break
            except:
                break
@@ -134,6 +137,14 @@ def get_data_from_json(file_number, conf: config.Config):  # Return metadata from JSON
        print('[-]Movie Number not found!')
        return None
 
+    # Strict number check: guards against broken data sources that return the same mismatched record
+    # (e.g. "本橋実来 ADZ335") no matter which number is submitted.
+    # The number naming rules currently follow javdb.com (Domain Creation Date: 2013-06-19T18:34:27Z);
+    # other rule sets such as airav.wiki (Domain Creation Date: 2019-08-28T07:18:42.0Z) are worth tracking too.
+    # If different studios ever collide on the same number under javdb.com's rules, consider switching rules
+    # and updating the corresponding number parsing and scraping code.
+    if str(json_data.get('number')).upper() != file_number.upper():
+        print('[-]Movie number has changed! [{}]->[{}]'.format(file_number, str(json_data.get('number'))))
+        return None
+
     # ================================================ End of per-site rules ================================================
 
     title = json_data.get('title')
@@ -167,6 +178,10 @@ def get_data_from_json(file_number, conf: config.Config):  # Return metadata from JSON
     imagecut = json_data.get('imagecut')
     tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',')  # string to list
+    while 'XXXX' in tag:
+        tag.remove('XXXX')
+    while 'xxx' in tag:
+        tag.remove('xxx')
     actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '')
 
     if title == '' or number == '':
@@ -225,6 +240,8 @@ def get_data_from_json(file_number, conf: config.Config):  # Return metadata from JSON
     studio = studio.replace('エムズビデオグループ','M’s Video Group')
     studio = studio.replace('ミニマム','Minimum')
     studio = studio.replace('ワープエンタテインメント','WAAP Entertainment')
+    studio = studio.replace('pacopacomama,パコパコママ','pacopacomama')
+    studio = studio.replace('パコパコママ','pacopacomama')
     studio = re.sub('.*/妄想族','妄想族',studio)
     studio = studio.replace('/',' ')
     # === Studio katakana replacement END
@@ -293,4 +310,7 @@ def special_characters_replacement(text) -> str:
            replace('"', '＂').  # U+FF02 FULLWIDTH QUOTATION MARK @ Basic Multilingual Plane
            replace('<', 'ᐸ').  # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane
            replace('>', 'ᐳ').  # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane
-            replace('|', 'ǀ'))  # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
+            replace('|', 'ǀ').  # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
+            replace('&lsquo;', '‘').  # U+2018 LEFT SINGLE QUOTATION MARK
+            replace('&rsquo;', '’').  # U+2019 RIGHT SINGLE QUOTATION MARK
+            replace('&amp;', '&'))
diff --git a/WebCrawler/airav.py b/WebCrawler/airav.py
index 59254217f..f7b144ce8 100644
--- a/WebCrawler/airav.py
+++ b/WebCrawler/airav.py
@@ -6,6 +6,7 @@
 from bs4 import BeautifulSoup#need install
 import json
 from ADC_function import *
+from WebCrawler import javbus
 
 '''
 API
@@ -17,95 +18,94 @@
 host = 'https://www.airav.wiki'
 
 # airav has no actor photos, so the javbus images are used directly
-def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
-    soup = BeautifulSoup(htmlcode, 'lxml')
-    a = soup.find_all(attrs={'class': 'star-name'})
-    d={}
-    for i in a:
-        l=i.a['href']
-        t=i.get_text()
-        html = etree.fromstring(get_html(l), etree.HTMLParser())
-        p=urljoin("https://www.javbus.com",
-                  str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
-        p2={t:p}
-        d.update(p2)
-    return d
+def getActorPhoto(javbus_json):
+    result = javbus_json.get('actor_photo')
+    if isinstance(result, dict) and len(result):
+        return result
+    return ''
 
 def getTitle(htmlcode):  # Get the title
-    doc = pq(htmlcode)
-    # h5:first-child targets the first h5 tag; this syntax took ages to find
-    title = str(doc('div.d-flex.videoDataBlock h5.d-none.d-md-block:nth-child(2)').text()).replace(' ', '-')
-    try:
-        title2 = re.sub('n\d+-','',title)
-
-        return title2
-    except:
-        return title
-
-def getStudio(htmlcode):  # Get the studio (revised)
-    html = etree.fromstring(htmlcode,etree.HTMLParser())
-    # Without a director in the record, the studio is the 4th field
-    if '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
-    # With a director in the record, the studio is the 5th field
-    elif '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
-    else:
-        result = ''
+    html = etree.fromstring(htmlcode, etree.HTMLParser())
+    title = str(html.xpath('/html/head/title/text()')[0])
+    result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip()
     return result
 
-def getYear(htmlcode):  # Get the year
+
+def getStudio(htmlcode, javbus_json):  # Get the studio (revised)
+    # Prefer javbus data when available
+    result = javbus_json.get('studio')
+    if isinstance(result, str) and len(result):
+        return result
     html = etree.fromstring(htmlcode,etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
-    return result
-def getCover(htmlcode):  # Get the cover link
-    doc = pq(htmlcode)
-    image = doc('a.bigImage')
-    return urljoin("https://www.javbus.com", image.attr('href'))
-def getRelease(htmlcode):  # Get the release date
+    return str(html.xpath('//a[contains(@href,"?video_factory=")]/text()')).strip(" ['']")
+def getYear(htmlcode, javbus_json):  # Get the year
+    result = javbus_json.get('year')
+    if isinstance(result, str) and len(result):
+        return result
+    release = getRelease(htmlcode, javbus_json)
+    if len(release) != len('2000-01-01'):
+        return ''
+    return release[:4]
+def getCover(htmlcode, javbus_json):  # Get the cover image
+    result = javbus_json.get('cover')
+    if isinstance(result, str) and len(result):
+        return result
     html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
-    return result
-def getRuntime(htmlcode):  # Get the runtime in minutes (revised)
+    return html.xpath('//img[contains(@src,"/storage/big_pic/")]/@src')[0]
+def getRelease(htmlcode, javbus_json):  # Get the release date
+    result = javbus_json.get('release')
+    if isinstance(result, str) and len(result):
+        return result
     html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘")
+    try:
+        result = re.search(r'\d{4}-\d{2}-\d{2}', str(html.xpath('//li[contains(text(),"發片日期")]/text()'))).group()
+    except:
+        return ''
     return result
-def getActor(htmlcode):  # Get the actresses
+def getRuntime(javbus_json):  # Get the runtime
+    result = javbus_json.get('runtime')
+    if isinstance(result, str) and len(result):
+        return result
+    return ''
+# airav's actress database mostly has kanji names while javbus mostly has kana, so airav takes priority
+def getActor(htmlcode, javbus_json):  # Get the actresses
     b=[]
-    soup=BeautifulSoup(htmlcode,'lxml')
-    a=soup.find_all(attrs={'class':'star-name'})
-    for i in a:
-        b.append(i.get_text())
-    return b
-def getNum(htmlcode):  # Get the number
     html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
-    return result
-def getDirector(htmlcode):  # Get the director (revised)
+    a = html.xpath('//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()')
+    for v in a:
+        v = v.strip()
+        if len(v):
+            b.append(v)
+    if len(b):
+        return b
+    result = javbus_json.get('actor')
+    if isinstance(result, list) and len(result):
+        return result
+    return []
+def getNum(htmlcode, javbus_json):  # Get the number
+    result = javbus_json.get('number')
+    if isinstance(result, str) and len(result):
+        return result
     html = etree.fromstring(htmlcode, etree.HTMLParser())
-    if '導演:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
-    else:
-        result = ''  # The record may have no director data
+    title = str(html.xpath('/html/head/title/text()')[0])
+    result = str(re.findall('^\[(.*?)]', title)[0])
     return result
-
-def getOutline(htmlcode):  # Get the actors
+def getDirector(javbus_json):  # Get the director (revised)
+    result = javbus_json.get('director')
+    if isinstance(result, str) and len(result):
+        return result
+    return ''
+def getOutline(htmlcode):  # Get the synopsis
     html = etree.fromstring(htmlcode, etree.HTMLParser())
     try:
-        result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','')
+        result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','').strip()
         return result
     except:
         return ''
-def getSerise(htmlcode):  # Get the series (revised)
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    # Without a director in the record, the series is the 6th field
-    if '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']")
-    # With a director in the record, the series is the 7th field
-    elif '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
-    else:
-        result = ''
-    return result
+def getSerise(javbus_json):  # Get the series (revised)
+    result = javbus_json.get('series')
+    if isinstance(result, str) and len(result):
+        return result
+    return ''
 def getTag(htmlcode):  # Get the tags
     tag = []
     soup = BeautifulSoup(htmlcode, 'lxml')
@@ -169,52 +169,50 @@ def main(number):
     try:
         try:
             htmlcode = get_html('https://cn.airav.wiki/video/' + number)
-            javbus_htmlcode = get_html('https://www.javbus.com/ja/' + number)
-
+            javbus_json = json.loads(javbus.main(number))
         except:
             print(number)
         dic = {
            # Title can come from airav
-            'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
-            # Studio from javbus
-            'studio': getStudio(javbus_htmlcode),
-            # Year from javbus as well
-            'year': str(re.search('\d{4}', getYear(javbus_htmlcode)).group()),
+            'title': getTitle(htmlcode),
+            # Studio: try javbus first, fall back to this site
+            'studio': getStudio(htmlcode, javbus_json),
+            # Year: try javbus first, fall back to this site
+            'year': getYear(htmlcode, javbus_json),
            # Synopsis from airav
            'outline': getOutline(htmlcode),
            # From javbus
-            'runtime': getRuntime(javbus_htmlcode),
+            'runtime': getRuntime(javbus_json),
            # Director from javbus
-            'director': getDirector(javbus_htmlcode),
-            # Actors from airav
-            'actor': getActor(javbus_htmlcode),
-            # Release date from javbus
-            'release': getRelease(javbus_htmlcode),
+            'director': getDirector(javbus_json),
+            # Actors: try airav first
+            'actor': getActor(htmlcode, javbus_json),
+            # Release date: try javbus first
+            'release': getRelease(htmlcode, javbus_json),
            # Number from javbus
-            'number': getNum(javbus_htmlcode),
+            'number': getNum(htmlcode, javbus_json),
            # Cover link from javbus
-            'cover': getCover(javbus_htmlcode),
+            'cover': getCover(htmlcode, javbus_json),
            # Stills
            'extrafanart': getExtrafanart(htmlcode),
            'imagecut': 1,
            # From airav
            'tag': getTag(htmlcode),
            # From javbus
-            'label': getSerise(javbus_htmlcode),
+            'label': getSerise(javbus_json),
            # airav provides no actor photos
-            'actor_photo': getActorPhoto(javbus_htmlcode),
-
+#            'actor_photo': getActorPhoto(javbus_json),
            'website': 'https://www.airav.wiki/video/' + number,
            'source': 'airav.py',
            # From javbus
-            'series': getSerise(javbus_htmlcode),
+            'series': getSerise(javbus_json)
        }
        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), )  # .encode('UTF-8')
        return js
     except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
            print(e)
        data = {
            "title": "",
@@ -226,6 +224,6 @@ def main(number):
 
 
 if __name__ == '__main__':
-    #print(main('ADN-188'))
-    print(main('ADN-188'))
-    print(main('CJOD-278'))
+    print(main('ADV-R0624'))  # javbus page returns 404; airav has data
+    print(main('ADN-188'))  # one actress
+    print(main('CJOD-278'))  # several actresses; javbus uses kana for actress names, airav uses kanji
diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py
index 254f3e887..e38a452d5 100644
--- a/WebCrawler/avsox.py
+++ b/WebCrawler/avsox.py
@@ -3,50 +3,42 @@
 import re
 from lxml import etree
 import json
-from bs4 import BeautifulSoup
 from ADC_function import *
-# import sys
+from WebCrawler.storyline import getStoryline
 # import io
 # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
 
-def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
-    soup = BeautifulSoup(htmlcode, 'lxml')
-    a = soup.find_all(attrs={'class': 'avatar-box'})
+def getActorPhoto(html):
+    a = html.xpath('//a[@class="avatar-box"]')
     d = {}
     for i in a:
-        l = i.img['src']
-        t = i.span.get_text()
+        l = i.find('.//img').attrib['src']
+        t = i.find('span').text
         p2 = {t: l}
         d.update(p2)
     return d
-def getTitle(a):
+def getTitle(html):
     try:
-        html = etree.fromstring(a, etree.HTMLParser())
         result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']") #[0]
         return result.replace('/', '')
     except:
         return ''
-def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
-    soup = BeautifulSoup(a, 'lxml')
-    a = soup.find_all(attrs={'class': 'avatar-box'})
+def getActor(html):
+    a = html.xpath('//a[@class="avatar-box"]')
     d = []
     for i in a:
-        d.append(i.span.get_text())
+        d.append(i.find('span').text)
     return d
-def getStudio(a):
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getStudio(html):
     result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ')
     return result1
-def getRuntime(a):
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getRuntime(html):
     result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']")
     return result1
-def getLabel(a):
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getLabel(html):
     result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']")
     return result1
-def getNum(a):
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getNum(html):
     result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']")
     return result1
 def getYear(release):
@@ -55,28 +47,20 @@ def getYear(release):
         return result
     except:
         return release
-def getRelease(a):
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+def getRelease(html):
     result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']")
     return result1
-def getCover(htmlcode):
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
+def getCover(html):
     result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
     return result
-def getCover_small(htmlcode):
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
+def getCover_small(html):
     result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
     return result
-def getTag(a):  # Get the actors
-    soup = BeautifulSoup(a, 'lxml')
-    a = soup.find_all(attrs={'class': 'genre'})
-    d = []
-    for i in a:
-        d.append(i.get_text())
-    return d
-def getSeries(htmlcode):
+def getTag(html):
+    x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
+    return x[2:] if len(x) > 2 else []
+def getSeries(html):
     try:
-        html = etree.fromstring(htmlcode, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
         result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
         return result1
     except:
@@ -86,42 +70,45 @@ def main(number):
     html = get_html('https://tellme.pw/avsox')
     site = etree.HTML(html).xpath('//div[@class="container"]/div/a/@href')[0]
     a = get_html(site + '/cn/search/' + number)
-    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+    html = etree.fromstring(a, etree.HTMLParser())
     result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
     if result1 == '' or result1 == 'null' or result1 == 'None':
         a = get_html(site + '/cn/search/' + number.replace('-', '_'))
-        html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+        html = etree.fromstring(a, etree.HTMLParser())
         result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
     if result1 == '' or result1 == 'null' or result1 == 'None':
         a = get_html(site + '/cn/search/' + number.replace('_', ''))
-        html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
+        html = etree.fromstring(a, etree.HTMLParser())
         result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
-    web = get_html("https:" + result1)
-    soup = BeautifulSoup(web, 'lxml')
-    info = str(soup.find(attrs={'class': 'row movie'}))
+    detail = get_html("https:" + result1)
+    lx = etree.fromstring(detail, etree.HTMLParser())
     try:
+        new_number = getNum(lx)
+        if new_number.upper() != number.upper():
+            raise ValueError('number not found')
+        title = getTitle(lx).strip(new_number)
         dic = {
-            'actor': getActor(web),
-            'title': getTitle(web).strip(getNum(web)),
-            'studio': getStudio(info),
-            'outline': '',  #
-            'runtime': getRuntime(info),
+            'actor': getActor(lx),
+            'title': title,
+            'studio': getStudio(lx),
+            'outline': getStoryline(number, title),
+            'runtime': getRuntime(lx),
             'director': '',  #
-            'release': getRelease(info),
-            'number': getNum(info),
-            'cover': getCover(web),
-            'cover_small': getCover_small(a),
+            'release': getRelease(lx),
+            'number': new_number,
+            'cover': getCover(lx),
+            'cover_small': getCover_small(html),
             'imagecut': 3,
-            'tag': getTag(web),
-            'label': getLabel(info),
-            'year': getYear(getRelease(info)),  # str(re.search('\d{4}',getRelease(a)).group()),
-            'actor_photo': getActorPhoto(web),
+            'tag': getTag(lx),
+            'label': getLabel(lx),
+            'year': getYear(getRelease(lx)),
+            'actor_photo': getActorPhoto(lx),
             'website': "https:" + result1,
             'source': 'avsox.py',
-            'series': getSeries(info),
+            'series': getSeries(lx),
         }
     except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
             print(e)
         dic = {"title": ""}
     js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
@@ -129,3 +116,4 @@ def main(number):
 
 if __name__ == "__main__":
     print(main('012717_472'))
+    print(main('1'))  # a fake result comes back and 'number not found' is raised
diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py
index 8eee1aff6..790b91039 100755
--- a/WebCrawler/carib.py
+++ b/WebCrawler/carib.py
@@ -1,51 +1,56 @@
 import sys
 sys.path.append('../')
 import json
-from bs4 import BeautifulSoup
 from lxml import html
 import re
 from ADC_function import *
+from WebCrawler.storyline import getStoryline
 
 
 def main(number: str) -> json:
     try:
-        caribbytes, browser = get_html_by_browser(
-            'https://www.caribbeancom.com/moviepages/'+number+'/index.html',
-            return_type="browser")
-
-        if not caribbytes or not caribbytes.ok:
+        # The actor-photo feature is not used yet; commented out for speed, using get_html() instead
+        #r, browser = get_html_by_browser('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
+        #                                 return_type='browser')
+        #if not r.ok:
+        #    raise ValueError("page not found")
+        #htmlcode = str(browser.page)
+        htmlbyte = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type='content')
+        htmlcode = htmlbyte.decode('euc-jp')
+        if not htmlcode or '404' in htmlcode or 'class="movie-info section"' not in htmlcode:
             raise ValueError("page not found")
 
-        lx = html.fromstring(str(browser.page))
+        lx = html.fromstring(htmlcode)
+        title = get_title(lx)
+
+        dic = {
+            'title': title,
+            'studio': '加勒比',
+            'year': get_year(lx),
+            'outline': get_outline(lx, number, title),
+            'runtime': get_runtime(lx),
+            'director': '',
+            'actor': get_actor(lx),
+            'release': get_release(lx),
+            'number': number,
+            'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
+            'tag': get_tag(lx),
+            'extrafanart': get_extrafanart(lx),
+            'label': get_series(lx),
+            'imagecut': 1,
+#            'actor_photo': get_actor_photo(browser),
+            'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
+            'source': 'carib.py',
+            'series': get_series(lx),
+        }
+        js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
+        return js
 
-        if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
-            raise ValueError("page info not found")
     except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
             print(e)
         dic = {"title": ""}
         return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
 
-    dic = {
-        'title': get_title(lx),
-        'studio': '加勒比',
-        'year': get_year(lx),
-        'outline': get_outline(lx),
-        'runtime': get_runtime(lx),
-        'director': '',
-        'actor': get_actor(lx),
-        'release': get_release(lx),
-        'number': number,
-        'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
-        'tag': get_tag(lx),
-        'extrafanart': get_extrafanart(lx),
-        'label': get_series(lx),
-        'imagecut': 1,
-#        'actor_photo': get_actor_photo(browser),
-        'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
-        'source': 'carib.py',
-        'series': get_series(lx),
-    }
-    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
-    return js
+
 
 def get_title(lx: html.HtmlElement) -> str:
     return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip()
@@ -53,8 +58,12 @@ def get_title(lx: html.HtmlElement) -> str:
 def get_year(lx: html.HtmlElement) -> str:
     return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
 
-def get_outline(lx: html.HtmlElement) -> str:
-    return lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
+def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
+    o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
+    g = getStoryline(number, title)
+    if len(g):
+        return g
+    return o
 
 def get_release(lx: html.HtmlElement) -> str:
     return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')
@@ -114,11 +123,10 @@ def get_actor_photo(browser):
         if pos<0:
             continue
         css = html[pos:pos+100]
-        p0 = css.find('background: url(')
-        p1 = css.find('.jpg)')
-        if p0<0 or p1<0:
+        cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
+        if not cssBGjpgs or not len(cssBGjpgs[0]):
             continue
-        p = {k: urljoin(browser.url, css[p0+len('background: url('):p1+len('.jpg')])}
+        p = {k: urljoin(browser.url, cssBGjpgs[0])}
         o.update(p)
     return o
 
diff --git a/WebCrawler/dlsite.py b/WebCrawler/dlsite.py
index 066e04f80..d22cdb170 100644
--- a/WebCrawler/dlsite.py
+++ b/WebCrawler/dlsite.py
@@ -153,7 +153,7 @@ def main(number):
         js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
         return js
     except Exception as e:
-        if config.Config().debug():
+        if config.getInstance().debug():
             print(e)
         data = {
             "title": "",
diff --git a/WebCrawler/fc2.py b/WebCrawler/fc2.py
index e6ae516ad..0a51fdc0b 100644
--- a/WebCrawler/fc2.py
+++ b/WebCrawler/fc2.py
@@ -93,10 +93,11 @@ def main(number):
             actor = '素人'
         lx = etree.fromstring(htmlcode2, etree.HTMLParser())
         cover = str(lx.xpath("//div[@class='items_article_MainitemThumb']/span/img/@src")).strip(" ['']")
+        cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover)
         dic = {
             'title': lx.xpath('/html/head/title/text()')[0],
             'studio': getStudio_fc2com(htmlcode2),
-            'year': getYear_fc2com(getRelease_fc2com(htmlcode2)), 
+            'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
             'outline': '',  # getOutline_fc2com(htmlcode2),
             'runtime': str(lx.xpath("//p[@class='items_article_info']/text()")[0]),
             'director': getStudio_fc2com(htmlcode2),
@@ -116,7 +117,7 @@ def main(number):
             'series': '',
         }
     except Exception as e:
-        if ADC_function.config.Config().debug():
+        if ADC_function.config.getInstance().debug():
             print(e)
         dic = {"title": ""}
     js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
@@ -124,4 +125,5 @@ def main(number):
 
 if __name__ == '__main__':
     print(main('FC2-1787685'))
+    print(main('FC2-2086710'))
diff --git a/WebCrawler/fc2club.py b/WebCrawler/fc2club.py
index 7d0fac6fb..df14b3b6a 100644
--- a/WebCrawler/fc2club.py
+++ b/WebCrawler/fc2club.py
@@ -84,7 +84,7 @@ def main(number):
         dic = {
             'title': getTitle_fc2com(htmlcode2),
             'studio': getStudio_fc2com(htmlcode2),
-            'year': getYear_fc2com(getRelease_fc2com(htmlcode2)), 
+            'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
             'outline': '',  # getOutline_fc2com(htmlcode2),
             'runtime': '',
             'director': getStudio_fc2com(htmlcode2),
@@ -103,7 +103,7 @@ def main(number):
             'series': '',
         }
     except Exception as e:
-        if ADC_function.config.Config().debug():
+        if ADC_function.config.getInstance().debug():
             print(e)
         dic = {"title": ""}
     js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )  # .encode('UTF-8')
diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py
index 7446ef3fe..786605274 100644
--- a/WebCrawler/javbus.py
+++ b/WebCrawler/javbus.py
@@ -1,114 +1,76 @@
 import sys
 sys.path.append('../')
 import re
-from pyquery import PyQuery as pq#need install
 from lxml import etree#need install
-from bs4 import BeautifulSoup#need install
 import json
 from ADC_function import *
-from WebCrawler import fanza
-from WebCrawler import airav
+from WebCrawler.storyline import getStoryline
 
-def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
-    soup = BeautifulSoup(htmlcode, 'lxml')
-    a = soup.find_all(attrs={'class': 'star-name'})
+def getActorPhoto(html):
+    actors = html.xpath('//div[@class="star-name"]/a')
     d={}
-    for i in a:
-        l=i.a['href']
-        t=i.get_text()
-        html = etree.fromstring(get_html(l), etree.HTMLParser())
+    for i in actors:
+        url=i.attrib['href']
+        t=i.attrib['title']
+        html = etree.fromstring(get_html(url), etree.HTMLParser())
         p=urljoin("https://www.javbus.com",
                   str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
         p2={t:p}
         d.update(p2)
     return d
-def getTitle(htmlcode):  # Get the title
-    doc = pq(htmlcode)
-    title=str(doc('div.container h3').text()).replace(' ','-')
-    try:
-        title2 = re.sub('n\d+-','',title)
- return title2 - except: - return title -def getStudio(htmlcode): #获取厂商 已修改 - html = etree.fromstring(htmlcode,etree.HTMLParser()) - # 如果记录中冇导演,厂商排在第4位 - if '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") - # 如果记录中有导演,厂商排在第5位 - elif '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']") - else: - result = '' - return result -def getYear(htmlcode): #获取年份 - html = etree.fromstring(htmlcode,etree.HTMLParser()) +def getTitle(html): #获取标题 + title = str(html.xpath('/html/head/title/text()')[0]) + title = str(re.findall('^.+?\s+(.*) - JavBus$', title)[0]).strip() + return title +def getStudioJa(html): + x = html.xpath('//span[contains(text(),"メーカー:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getStudio(html): #获取厂商 + x = html.xpath('//span[contains(text(),"製作商:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getYear(html): #获取年份 + result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']").strip() + return result[:4] if len(result)>=len('2000-01-01') else '' +def getCover(html): #获取封面链接 + image = str(html.xpath('//a[@class="bigImage"]/@href')[0]) + return urljoin("https://www.javbus.com", image) +def getRelease(html): #获取出版日期 result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") return result -def getCover(htmlcode): #获取封面链接 - doc = pq(htmlcode) - image = doc('a.bigImage') - return urljoin("https://www.javbus.com", image.attr('href')) -def getRelease(htmlcode): #获取出版日期 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']") - return result -def getRuntime(htmlcode): #获取分钟 已修改 - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getRuntime(html): #获取分钟 已修改 result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘") return result -def getActor(htmlcode): #获取女优 +def getActor(html): #获取女优 b=[] - soup=BeautifulSoup(htmlcode,'lxml') - a=soup.find_all(attrs={'class':'star-name'}) - for i in a: - b.append(i.get_text()) + actors = html.xpath('//div[@class="star-name"]/a') + for i in actors: + b.append(i.attrib['title']) return b -def getNum(htmlcode): #获取番号 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']") - return result -def getDirector(htmlcode): #获取导演 已修改 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - if '導演:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']") - else: - result = '' # 记录中有可能没有导演数据 - return result -def getCID(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) - #print(htmlcode) +def getNum(html): #获取番号 + kwdlist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') + return kwdlist[0] +def getDirectorJa(html): + x = html.xpath('//span[contains(text(),"監督:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getDirector(html): #获取导演 + x = html.xpath('//span[contains(text(),"導演:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getCID(html): string = 
html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','') result = re.sub('/.*?.jpg','',string) return result -def getOutline(number): #获取剧情介绍 - try: - response = json.loads(airav.main(number)) - result = response['outline'] - return result - except: - return '' -def getSerise(htmlcode): #获取系列 已修改 - html = etree.fromstring(htmlcode, etree.HTMLParser()) - # 如果记录中冇导演,系列排在第6位 - if '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']") - # 如果记录中有导演,系列排在第7位 - elif '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"): - result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']") - else: - result = '' - return result -def getTag(htmlcode): # 获取标签 - tag = [] - soup = BeautifulSoup(htmlcode, 'lxml') - a = soup.find_all(attrs={'class': 'genre'}) - for i in a: - if 'onmouseout' in str(i) or '多選提交' in str(i): - continue - tag.append(translateTag_to_sc(i.get_text())) - return tag - +def getOutline(number, title): #获取剧情介绍 多进程并发查询 + return getStoryline(number,title) +def getSeriseJa(html): + x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getSerise(html): #获取系列 + x = html.xpath('//span[contains(text(),"系列:")]/../a/text()') + return str(x[0]) if len(x) else '' +def getTag(html): # 获取标签 + klist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') + taglist = [translateTag_to_sc(v) for v in klist[1:]] + return taglist def getExtrafanart(htmlcode): # 获取剧照 html_pather = re.compile(r'<div id=\"sample-waterfall\">[\s\S]*?</div></a>\s*?</div>') html = html_pather.search(htmlcode) @@ -117,32 +79,34 @@ def getExtrafanart(htmlcode): # 获取剧照 extrafanart_pather = re.compile(r'<a class=\"sample-box\" href=\"(.*?)\"') extrafanart_imgs = extrafanart_pather.findall(html) if extrafanart_imgs: - return extrafanart_imgs + return [urljoin('https://www.javbus.com',img) for img in extrafanart_imgs] return '' def main_uncensored(number): htmlcode = get_html('https://www.javbus.com/ja/' + number) - if getTitle(htmlcode) == '': - htmlcode = get_html('https://www.javbus.com/ja/' + number.replace('-','_')) + if "<title>404 Page Not Found" in htmlcode: + raise Exception('404 page not found') + lx = etree.fromstring(htmlcode, etree.HTMLParser()) + title = getTitle(lx) dic = { - 'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''), - 'studio': getStudio(htmlcode), - 'year': getYear(htmlcode), - 'outline': getOutline(number), - 'runtime': getRuntime(htmlcode), - 'director': getDirector(htmlcode), - 'actor': getActor(htmlcode), - 'release': getRelease(htmlcode), - 'number': getNum(htmlcode), - 'cover': getCover(htmlcode), - 'tag': getTag(htmlcode), + 'title': title, + 'studio': getStudioJa(lx), + 'year': getYear(lx), + 'outline': getOutline(number, title), + 'runtime': getRuntime(lx), + 'director': getDirectorJa(lx), + 'actor': getActor(lx), + 'release': getRelease(lx), + 'number': getNum(lx), + 'cover': getCover(lx), + 'tag': getTag(lx), 'extrafanart': getExtrafanart(htmlcode), - 'label': getSerise(htmlcode), + 'label': getSeriseJa(lx), 'imagecut': 0, - 'actor_photo': '', +# 'actor_photo': '', 'website': 'https://www.javbus.com/ja/' + number, 'source': 'javbus.py', - 'series': getSerise(htmlcode), + 'series': getSeriseJa(lx), } js = json.dumps(dic, ensure_ascii=False, 
sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') return js @@ -155,32 +119,36 @@ def main(number): htmlcode = get_html('https://www.fanbus.us/' + number) except: htmlcode = get_html('https://www.javbus.com/' + number) + if "<title>404 Page Not Found" in htmlcode: + raise Exception('404 page not found') + lx = etree.fromstring(htmlcode,etree.HTMLParser()) + title = getTitle(lx) dic = { - 'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))), - 'studio': getStudio(htmlcode), - 'year': str(re.search('\d{4}', getYear(htmlcode)).group()), - 'outline': getOutline(number), - 'runtime': getRuntime(htmlcode), - 'director': getDirector(htmlcode), - 'actor': getActor(htmlcode), - 'release': getRelease(htmlcode), - 'number': getNum(htmlcode), - 'cover': getCover(htmlcode), + 'title': title, + 'studio': getStudio(lx), + 'year': getYear(lx), + 'outline': getOutline(number, title), + 'runtime': getRuntime(lx), + 'director': getDirector(lx), + 'actor': getActor(lx), + 'release': getRelease(lx), + 'number': getNum(lx), + 'cover': getCover(lx), 'imagecut': 1, - 'tag': getTag(htmlcode), + 'tag': getTag(lx), 'extrafanart': getExtrafanart(htmlcode), - 'label': getSerise(htmlcode), - 'actor_photo': getActorPhoto(htmlcode), + 'label': getSerise(lx), +# 'actor_photo': getActorPhoto(lx), 'website': 'https://www.javbus.com/' + number, 'source': 'javbus.py', - 'series': getSerise(htmlcode), + 'series': getSerise(lx), } js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8') return js except: return main_uncensored(number) except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) data = { "title": "", @@ -191,5 +159,13 @@ def main(number): return js if __name__ == "__main__" : + config.G_conf_override['debug_mode:switch'] = True + print(main('ABP-888')) + print(main('ABP-960')) + print(main('ADV-R0624')) # 404 + print(main('MMNT-010')) print(main('ipx-292')) print(main('CEMD-011')) + print(main('CJOD-278')) + print(main('100221_001')) + print(main('AVSW-061')) diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py index ecc4f3637..e4e803c2d 100755 --- a/WebCrawler/javdb.py +++ b/WebCrawler/javdb.py @@ -3,25 +3,22 @@ import re from lxml import etree import json -from bs4 import BeautifulSoup from ADC_function import * -from WebCrawler import airav -# import sys +from mechanicalsoup.stateful_browser import StatefulBrowser +from WebCrawler.storyline import getStoryline # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) -def getTitle(a): - html = etree.fromstring(a, etree.HTMLParser()) +def getTitle(html): browser_title = str(html.xpath("/html/head/title/text()")[0]) return browser_title[:browser_title.find(' | JavDB')].strip() -def getActor(a): - html = etree.fromstring(a, etree.HTMLParser()) +def getActor(html): actors = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/text()') genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class') r = [] idx = 0 - actor_gendor = config.Config().actor_gender() + actor_gendor = config.getInstance().actor_gender() if not actor_gendor in ['female','male','both','all']: actor_gendor = 'female' for act in actors: @@ -33,8 +30,8 @@ def getActor(a): idx = idx + 1 return r -def getaphoto(url): - html_page = get_html(url) +def getaphoto(url, browser): + html_page = browser.open_relative(url).text if isinstance(browser, StatefulBrowser) else get_html(url) img_prether = 
re.compile(r'<span class\=\"avatar\" style\=\"background\-image\: url\((.*?)\)') img_url = img_prether.findall(html_page) if img_url: @@ -42,24 +39,18 @@ def getaphoto(url): else: return '' -def getActorPhoto(html): #//*[@id="star_qdt"]/li/a/img - actorall_prether = re.compile(r'<strong>演員\:</strong>\s*?.*?<span class=\"value\">(.*)\s*?</div>') - actorall = actorall_prether.findall(html) - - if actorall: - actoralls = actorall[0] - actor_prether = re.compile(r'<a href\=\"(.*?)\">(.*?)</a>') - actor = actor_prether.findall(actoralls) - actor_photo = {} - for i in actor: - actor_photo[i[1]] = getaphoto('https://' + javdb_site + '.com'+i[0]) - - return actor_photo - - else: +def getActorPhoto(html, javdb_site, browser): #//*[@id="star_qdt"]/li/a/img + actorall = html.xpath('//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]') + if not actorall: return {} + a = getActor(html) + actor_photo = {} + for i in actorall: + if i.text in a: + actor_photo[i.text] = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), browser) + return actor_photo -def getStudio(a): +def getStudio(a, html): # html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() # result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']") # result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']") @@ -67,23 +58,25 @@ def getStudio(a): patherr = re.compile(r'<strong>片商\:</strong>[\s\S]*?<a href=\".*?>(.*?)</a></span>') pianshang = patherr.findall(a) if pianshang: - result = pianshang[0] - else: - result = "" + result = pianshang[0].strip() + if len(result): + return result + # 以卖家作为工作室 + try: + result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']") + except: + result = '' return result -def getRuntime(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getRuntime(html): result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').rstrip('mi') -def getLabel(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getLabel(html): result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getNum(a): - html = etree.fromstring(a, etree.HTMLParser()) +def getNum(html): result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']") return str(result2 + result1).strip('+') @@ -113,8 +106,7 @@ def getRelease(a): else: result = '' return result -def getTag(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getTag(html): try: result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()') total = [] @@ -135,11 +127,10 @@ def getTag(a): pass return total -def getCover_small(a, index=0): +def getCover_small(html, index=0): # same issue mentioned below, # javdb sometime returns multiple results # DO NOT just get the firt one, get the one with correct index number - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() try: result = html.xpath("//div[@class='item-image 
fix-scale-cover']/img/@src")[index] if not 'https' in result: @@ -170,66 +161,76 @@ def getTrailer(htmlcode): # 获取预告片 video_url = '' return video_url -def getExtrafanart(htmlcode): # 获取剧照 - html_pather = re.compile(r'<div class=\"tile\-images preview\-images\">[\s\S]*?</a>\s+?</div>\s+?</div>') - html = html_pather.search(htmlcode) - if html: - html = html.group() - extrafanart_pather = re.compile(r'<a class="tile-item" href=\"(.*?)\"') - extrafanart_imgs = extrafanart_pather.findall(html) - if extrafanart_imgs: - return extrafanart_imgs - return '' - -def getCover(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getExtrafanart(html): # 获取剧照 + result = [] + try: + result = html.xpath("//article[@class='message video-panel']/div[@class='message-body']/div[@class='tile-images preview-images']/a[contains(@href,'/samples/')]/@href") + except: + pass + return result +def getCover(html): try: result = html.xpath("//div[contains(@class, 'column-video-cover')]/a/img/@src")[0] except: # 2020.7.17 Repair Cover Url crawl result = html.xpath("//div[contains(@class, 'column-video-cover')]/img/@src")[0] return result -def getDirector(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getDirector(html): result1 = str(html.xpath('//strong[contains(text(),"導演")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"導演")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') -def getOutline(number): #获取剧情介绍 +def getOutline0(number): #获取剧情介绍 airav.wiki站点404,函数暂时更名,等无法恢复时删除 try: - response = json.loads(airav.main(number)) - result = response['outline'] + htmlcode = get_html('https://cn.airav.wiki/video/' + number) + from WebCrawler.airav import getOutline as airav_getOutline + result = airav_getOutline(htmlcode) return result except: - return '' -def getSeries(a): - #/html/body/section/div/div[3]/div[2]/nav/div[7]/span/a - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() + pass + return '' +def getOutline(number, title): #获取剧情介绍 多进程并发查询 + return getStoryline(number,title) +def getSeries(html): result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']") result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']") return str(result1 + result2).strip('+').replace("', '", '').replace('"', '') def main(number): - javdb_site = random.choice(["javdb9", "javdb30"]) + # javdb更新后同一时间只能登录一个数字站,最新登录站会踢出旧的登录,因此按找到的第一个javdb*.json文件选择站点, + # 如果无.json文件或者超过有效期,则随机选择一个站点。 + javdb_sites = ["javdb31", "javdb32"] + debug = config.getInstance().debug() try: # if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number).group(): # pass # else: # number = number.upper() number = number.upper() - cookie_json = './' + javdb_site + '.json' javdb_cookies = {'over18':'1', 'theme':'auto', 'locale':'zh'} # 不加载过期的cookie,javdb登录界面显示为7天免登录,故假定cookie有效期为7天 - cookies_dict, cookies_filepath = load_cookies(cookie_json) - if isinstance(cookies_dict, dict) and isinstance(cookies_filepath, str): - cdays = file_modification_days(cookies_filepath) - if cdays < 7: - javdb_cookies = cookies_dict - elif cdays != 9999: - print( -f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.') - + has_json = False + for cj in javdb_sites: + javdb_site = cj + cookie_json = javdb_site + '.json' + cookies_dict, cookies_filepath = load_cookies(cookie_json) + if isinstance(cookies_dict, 
dict) and isinstance(cookies_filepath, str): + cdays = file_modification_days(cookies_filepath) + if cdays < 7: + javdb_cookies = cookies_dict + has_json = True + break + elif cdays != 9999: + print(f'[!]Cookies file {cookies_filepath} was updated {cdays} days ago, it will not be used for HTTP requests.') + if not has_json: + javdb_site = secrets.choice(javdb_sites) + if debug: + print(f'[!]javdb:select site {javdb_site}') + browser = None try: javdb_url = 'https://' + javdb_site + '.com/search?q=' + number + '&f=all' - query_result = get_html(javdb_url, cookies=javdb_cookies) + res, browser = get_html_by_browser(javdb_url, cookies=javdb_cookies, return_type='browser') + if not res.ok: + raise + query_result = res.text except: query_result = get_html('https://javdb.com/search?q=' + number + '&f=all', cookies=javdb_cookies) html = etree.fromstring(query_result, etree.HTMLParser()) # //table/tr[1]/td[1]/text() @@ -250,61 +251,74 @@ def main(number): raise ValueError("number not found") correct_url = urls[0] try: - javdb_detail_url = 'https://' + javdb_site + '.com' + correct_url - detail_page = get_html(javdb_detail_url, cookies=javdb_cookies) + if isinstance(browser, StatefulBrowser): # get faster benefit from http keep-alive + detail_page = browser.open_relative(correct_url).text + else: + javdb_detail_url = 'https://' + javdb_site + '.com' + correct_url + detail_page = get_html(javdb_detail_url, cookies=javdb_cookies) except: detail_page = get_html('https://javdb.com' + correct_url, cookies=javdb_cookies) + # etree.fromstring开销很大,最好只用一次,而它的xpath很快,比bs4 find/select快,可以多用 + lx = etree.fromstring(detail_page, etree.HTMLParser()) # no cut image by default imagecut = 3 # If gray image exists ,then replace with normal cover if re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', number): - cover_small = getCover_small(query_result) + cover_small = getCover_small(html) else: try: - cover_small = getCover_small(query_result, index=ids.index(number)) + cover_small = getCover_small(html, index=ids.index(number)) except: # if input number is "STAR438" not "STAR-438", use first search result. 
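The cookie-selection loop above prefers whichever javdb*.json file is freshest (modified within the last 7 days) and falls back to secrets.choice() only when no usable cookie exists. A condensed sketch of the same decision, assuming the load_cookies() and file_modification_days() helpers that javdb.py already imports from ADC_function:

import secrets
from ADC_function import load_cookies, file_modification_days

def pick_javdb_site(sites=("javdb31", "javdb32"), max_age_days=7):
    """Return (site, cookies); cookies is None when no fresh cookie file exists."""
    for site in sites:
        cookies, path = load_cookies(f"{site}.json")
        if isinstance(cookies, dict) and isinstance(path, str) \
                and file_modification_days(path) < max_age_days:
            return site, cookies  # freshest valid login wins
    # no usable cookie file: pick a site at random and keep the guest cookies
    return secrets.choice(sites), None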
- cover_small = getCover_small(query_result) + cover_small = getCover_small(html) if 'placeholder' in cover_small: # replace wit normal cover and cut it imagecut = 1 - cover_small = getCover(detail_page) + cover_small = getCover(lx) - dp_number = getNum(detail_page) + dp_number = getNum(lx) if dp_number.upper() != number: raise ValueError("number not found") - title = getTitle(detail_page) + title = getTitle(lx) if title and dp_number: number = dp_number # remove duplicate title title = title.replace(number, '').strip() dic = { - 'actor': getActor(detail_page), + 'actor': getActor(lx), 'title': title, - 'studio': getStudio(detail_page), - 'outline': getOutline(number), - 'runtime': getRuntime(detail_page), - 'director': getDirector(detail_page), + 'studio': getStudio(detail_page, lx), + 'outline': getOutline(number, title), + 'runtime': getRuntime(lx), + 'director': getDirector(lx), 'release': getRelease(detail_page), 'number': number, - 'cover': getCover(detail_page), + 'cover': getCover(lx), 'cover_small': cover_small, 'trailer': getTrailer(detail_page), - 'extrafanart': getExtrafanart(detail_page), + 'extrafanart': getExtrafanart(lx), 'imagecut': imagecut, - 'tag': getTag(detail_page), - 'label': getLabel(detail_page), + 'tag': getTag(lx), + 'label': getLabel(lx), 'year': getYear(detail_page), # str(re.search('\d{4}',getRelease(a)).group()), - 'actor_photo': getActorPhoto(detail_page), +# 'actor_photo': getActorPhoto(lx, javdb_site, browser), 'website': 'https://javdb.com' + correct_url, 'source': 'javdb.py', - 'series': getSeries(detail_page), + 'series': getSeries(lx), } + if not dic['actor'] and re.match(r'FC2-[\d]+', number, re.A): + dic['actor'].append('素人') + if not dic['series']: + dic['series'] = dic['studio'] + if not dic['label']: + dic['label'] = dic['studio'] + + except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) dic = {"title": ""} js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8') @@ -313,10 +327,18 @@ def main(number): # main('DV-1562') # input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。") if __name__ == "__main__": + config.G_conf_override['debug_mode:switch'] = True # print(main('blacked.20.05.30')) # print(main('AGAV-042')) # print(main('BANK-022')) - print(main('FC2-735670')) - print(main('FC2-1174949')) # not found + # print(main('070116-197')) + # print(main('093021_539')) # 没有剧照 片商pacopacomama + # print(main('FC2-2278260')) + # print(main('FC2-735670')) + # print(main('FC2-1174949')) # not found print(main('MVSD-439')) - print(main('EHM0001')) # not found + # print(main('EHM0001')) # not found + # print(main('FC2-2314275')) + # print(main('EBOD-646')) + # print(main('LOVE-262')) + print(main('ABP-890')) diff --git a/WebCrawler/mgstage.py b/WebCrawler/mgstage.py index 59f457272..8f58cb695 100644 --- a/WebCrawler/mgstage.py +++ b/WebCrawler/mgstage.py @@ -137,7 +137,7 @@ def main(number2): 'series': getSeries(a), } except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) dic = {"title": ""} diff --git a/WebCrawler/storyline.py b/WebCrawler/storyline.py new file mode 100644 index 000000000..9b0a44c70 --- /dev/null +++ b/WebCrawler/storyline.py @@ -0,0 +1,334 @@ +import sys +sys.path.append('../') +import re +import json +import builtins +from ADC_function import * +from multiprocessing import Pool +from multiprocessing.dummy import Pool as ThreadPool +from 
difflib import SequenceMatcher
+from unicodedata import category
+from number_parser import is_uncensored
+
+G_registered_storyline_site = {"airav", "avno1", "xcity", "amazon", "58avgo"}
+
+G_mode_txt = ('顺序执行','线程池','进程池')
+
+class noThread(object):
+    def map(self, fn, param):
+        return builtins.map(fn, param)
+    def __enter__(self):
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        pass
+
+
+# Fetch the storyline from the listed sites concurrently; earlier sites take priority.
+def getStoryline(number, title, sites: list=None):
+    start_time = time.time()
+    conf = config.getInstance()
+    debug = conf.debug() or conf.storyline_show() == 2
+    storyline_sites = conf.storyline_site().split(',') if sites is None else sites
+    if is_uncensored(number):
+        storyline_sites += conf.storyline_uncensored_site().split(',')
+    else:
+        storyline_sites += conf.storyline_censored_site().split(',')
+    r_dup = set()
+    apply_sites = []
+    for s in storyline_sites:
+        if s in G_registered_storyline_site and s not in r_dup:
+            apply_sites.append(s)
+            r_dup.add(s)
+    mp_args = ((site, number, title, debug) for site in apply_sites)
+    cores = min(len(apply_sites), os.cpu_count())
+    if cores == 0:
+        return ''
+    run_mode = conf.storyline_mode()
+    assert run_mode in (0,1,2)
+    with ThreadPool(cores) if run_mode == 1 else Pool(cores) if run_mode == 2 else noThread() as pool:
+        result = pool.map(getStoryline_mp, mp_args)
+        result = list(result) if run_mode == 0 else result
+    if not debug and conf.storyline_show() == 0:
+        for value in result:
+            if isinstance(value, str) and len(value):
+                return value
+        return ''
+    # The debug summary below is written to the log; output from pool workers is not, it only appears on stdout.
+    cnt = len(apply_sites)
+    s = f'[!]Storyline{G_mode_txt[run_mode]}模式运行{cnt}个进程总用时(含启动开销){time.time() - start_time:.3f}秒,结束于{time.strftime("%H:%M:%S")}'
+    first = True
+    sel = ''
+    for i in range(cnt):
+        sl = len(result[i]) if isinstance(result[i], str) else 0
+        if sl and first:
+            s += f',[选中{apply_sites[i]}字数:{sl}]'
+            first = False
+            sel = result[i]
+        elif sl:
+            s += f',{apply_sites[i]}字数:{sl}'
+        else:
+            s += f',{apply_sites[i]}:空'
+    print(s)
+    return sel
+
+
+def getStoryline_mp(args):
+    return _getStoryline_mp(*args)
+
+
+# Note: print() from a worker process is not written to the log; when debugging a dead data
+# source, watch the standard output directly (issue reports need a screenshot of it).
+def _getStoryline_mp(site, number, title, debug):
+    start_time = time.time()
+    storyline = None
+    if not isinstance(site, str):
+        return storyline
+    elif site == "airav":
+        storyline = getStoryline_airav(number, debug)
+    elif site == "avno1":
+        storyline = getStoryline_avno1(number, debug)
+    elif site == "xcity":
+        storyline = getStoryline_xcity(number, debug)
+    elif site == "amazon":
+        storyline = getStoryline_amazon(title, number, debug)
+    elif site == "58avgo":
+        storyline = getStoryline_58avgo(number, debug)
+    if not debug:
+        return storyline
+    print("[!]MP 进程[{}]运行{:.3f}秒,结束于{}返回结果: {}".format(
+        site,
+        time.time() - start_time,
+        time.strftime("%H:%M:%S"),
+        storyline if isinstance(storyline, str) and len(storyline) else '[空]')
+    )
+    return storyline
+
+
+def getStoryline_airav(number, debug):
+    try:
+        number_up = number.upper()
+        site = secrets.choice(('airav.cc','airav4.club'))
+        url = f'https://{site}/searchresults.aspx?Search={number}&Type=0'
+        res, browser = get_html_by_browser(url, return_type='browser')
+        if not res.ok:
+            raise ValueError(f"get_html_by_browser('{url}') failed")
+        avs = browser.page.select_one('div.resultcontent > ul > li:nth-child(1) > div')
+        if number_up not in avs.select_one('a > h3').text.upper():
+            raise ValueError("number not found")
+        detail_url = avs.select_one('a')['href']
+        res = browser.open_relative(detail_url)
+        if not res.ok:
+            raise ValueError(f"browser.open_relative('{detail_url}') failed")
+        t = browser.page.select_one('head > title').text
+        airav_number = str(re.findall(r'^\s*\[(.*?)]', t)[0]).upper()
+        if number.upper() != airav_number:
+            raise ValueError(f"page number ->[{airav_number}] not match")
+        desc = browser.page.select_one('li.introduction > span').text.strip()
+        return desc
+    except Exception as e:
+        if debug:
+            print(f"[-]MP getOutline_airav Error: {e}, number [{number}].")
+        pass
+    return None
+
+
+def getStoryline_58avgo(number, debug):
+    try:
+        url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([
+            '', '?status=3', '?status=4', '?status=7', '?status=9', '?status=10', '?status=11', '?status=12',
+            '?status=1&Sort=Playon', '?status=1&Sort=dateupload', '?status=1&Sort=dateproduce'
+        ])  # pick one at random so a single IP's requests look less uniform in the site's httpd logs
+        kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
+        result, browser = get_html_by_form(url,
+            fields = {'ctl00$TextBox_SearchKeyWord' : kwd},
+            return_type = 'browser')
+        if not result.ok:
+            raise ValueError(f"get_html_by_form('{url}','{number}') failed")
+        if f'searchresults.aspx?Search={kwd}' not in browser.url:
+            raise ValueError("number not found")
+        s = browser.page.select('div.resultcontent > ul > li.listItem > div.one-info-panel.one > a.ga_click')
+        link = None
+        for i in range(len(s)):
+            title = s[i].h3.text.strip()
+            if re.search(number, title, re.I):
+                link = s[i]
+                break
+        if link is None:
+            raise ValueError("number not found")
+        result = browser.follow_link(link)
+        if not result.ok or 'playon.aspx' not in browser.url:
+            raise ValueError("detail page not found")
+        title = browser.page.select('head > title')[0].text.strip()
+        detail_number = str(re.findall('\[(.*?)]', title)[0])
+        if not re.search(number, detail_number, re.I):
+            raise ValueError(f"detail page number not match, got ->[{detail_number}]")
+        return browser.page.select('#ContentPlaceHolder1_Label2')[0].text.strip()
+    except Exception as e:
+        if debug:
+            print(f"[-]MP getOutline_58avgo Error: {e}, number [{number}].")
+        pass
+    return ''
+
+
+def getStoryline_avno1(number, debug):  # fetch the storyline from avno1.cc
+    try:
+        url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
+            secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
+            '?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php'
+        ])  # pick one at random so a single IP's requests look less uniform in the site's httpd logs
+        result, browser = get_html_by_form(url,
+            form_select='div.wrapper > div.header > div.search > form',
+            fields = {'kw' : number},
+            return_type = 'browser')
+        if not result.ok:
+            raise ValueError(f"get_html_by_form('{url}','{number}') failed")
+        s = browser.page.select('div.type_movie > div > ul > li > div')
+        for i in range(len(s)):
+            title = s[i].a.h3.text.strip()
+            page_number = title[title.rfind(' '):].strip()
+            if re.search(number, page_number, re.I):
+                return s[i]['data-description'].strip()
+        raise ValueError("number not found in search results")
+    except Exception as e:
+        if debug:
+            print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].")
+        pass
+    return ''
+
+
+def getStoryline_xcity(number, debug):  # fetch the storyline from xcity
+    try:
+        xcity_number = number.replace('-','')
+        query_result, browser = get_html_by_form(
+            'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
+            fields = {'q' : xcity_number.lower()},
+            return_type = 'browser')
+        if not query_result or not query_result.ok:
+            raise ValueError("page not found")
+        result = browser.follow_link(browser.links('avod\/detail')[0])
+        if not result.ok:
+            raise ValueError("detail page not found")
+        return browser.page.select_one('h2.title-detail + p.lead').text.strip()
+    except Exception as e:
+        if debug:
+            print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].")
+        pass
+    return ''
+
+
+def getStoryline_amazon(q_title, number, debug):
+    if not isinstance(q_title, str) or not len(q_title):
+        return None
+    try:
+        amazon_cookie, _ = load_cookies('amazon.json')
+        cookie = amazon_cookie if isinstance(amazon_cookie, dict) else None
+        url = "https://www.amazon.co.jp/s?k=" + q_title
+        res, browser = get_html_by_browser(url, cookies=cookie, return_type='browser')
+        if not res.ok:
+            raise ValueError("get_html_by_browser() failed")
+        lks = browser.links(r'/black-curtain/save-eligibility/black-curtain')
+        if isinstance(lks, list) and len(lks):
+            browser.follow_link(lks[0])
+            cookie = None
+        html = etree.fromstring(str(browser.page), etree.HTMLParser())
+        titles = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()")
+        urls = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href")
+        if not len(urls) or len(urls) != len(titles):
+            raise ValueError("titles not found")
+        idx = amazon_select_one(titles, q_title, number, debug)
+        if not isinstance(idx, int) or idx < 0:
+            raise ValueError("title and number not found")
+        furl = urls[idx]
+        r = browser.open_relative(furl)
+        if not r.ok:
+            raise ValueError("browser.open_relative() failed.")
+        lks = browser.links(r'/black-curtain/save-eligibility/black-curtain')
+        if isinstance(lks, list) and len(lks):
+            browser.follow_link(lks[0])
+            cookie = None
+
+        ama_t = browser.page.select_one('#productDescription > p').text.replace('\n',' ').strip()
+        ama_t = re.sub(r'審査番号:\d+', '', ama_t)
+
+        if cookie is None:
+            # The auto-created cookies file sits at the end of the search path list (lowest priority).
+            # Users with an amazon.co.jp account can export browser cookies into an earlier search path.
+            ama_save = Path.home() / ".local/share/avdc/amazon.json"
+            ama_save.parent.mkdir(parents=True, exist_ok=True)
+            ama_save.write_text(json.dumps(browser.session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
+
+        return ama_t
+
+    except Exception as e:
+        if debug:
+            print(f'[-]MP getOutline_amazon Error: {e}, number [{number}], title: {q_title}')
+        pass
+    return None
+
+# Among the shelf's DVD and Blu-ray listings, pick the one whose title is most similar to the query title.
+def amazon_select_one(a_titles, q_title, number, debug):
+    sel = -1
+    ratio = 0
+    que_t = ''.join(c for c in q_title if not re.match(r'(P|S|Z).*', category(c), re.A))
+    for loc in range(len(a_titles)):
+        t = a_titles[loc]
+        if re.search(number, t, re.I):  # listings rarely carry the ID number, but a few do; an exact match passes at once
+            return loc
+        if not re.search('DVD|Blu-ray', t, re.I):
+            continue
+        ama_t = str(re.sub('DVD|Blu-ray', '', t, flags=re.I))
+        ama_t = ''.join(c for c in ama_t if not re.match(r'(P|S|Z).*', category(c), re.A))
+        findlen = 0
+        lastpos = -1
+        cnt = len(ama_t)
+        for c in reversed(ama_t):
+            cnt -= 1
+            pos = que_t.rfind(c)
+            if lastpos >= 0:
+                pos_near = que_t[:lastpos].rfind(c)
+                if pos_near < 0:
+                    findlen = 0
+                    lastpos = -1
+                    ama_t = ama_t[:cnt+1]
+                else:
+                    pos = pos_near
+            if pos < 0:
+                if category(c) == 'Nd':
+                    return -1
+                ama_t = ama_t[:cnt]
+                findlen = 0
+                lastpos = -1
+                continue
+            if findlen > 0 and len(que_t) > 1 and lastpos == pos+1:
+                findlen += 1
+                lastpos = pos
+                if findlen >= 4:
+                    break
+                continue
+            findlen = 1
+            lastpos = pos
+        if findlen == 0:
+            return -1
+        r = SequenceMatcher(None, ama_t, que_t).ratio()
+        if r > ratio:
+            sel = loc
+            ratio = r
+            save_t_ = ama_t
+        if ratio > 0.999:
+            break
+
+    if ratio < 0.5:
+        return -1
+
+    if not 
debug: + # 目前采信相似度高于0.9的结果 + return sel if ratio >= 0.9 else -1 + + # debug 模式下记录识别准确率日志 + if ratio < 0.9: + # 相似度[0.5, 0.9)的淘汰结果单独记录日志 + (Path.home() / '.avlogs/ratio0.5.txt').open('a', encoding='utf-8').write( + f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n') + return -1 + # 被采信的结果日志 + (Path.home() / '.avlogs/ratio.txt').open('a', encoding='utf-8').write( + f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n') + return sel diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py index a7b4cffc2..ed381e75c 100644 --- a/WebCrawler/xcity.py +++ b/WebCrawler/xcity.py @@ -3,16 +3,12 @@ import re from lxml import etree import json -from bs4 import BeautifulSoup from ADC_function import * - - -# import sys +from WebCrawler.storyline import getStoryline # import io # sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True) -def getTitle(a): - html = etree.fromstring(a, etree.HTMLParser()) +def getTitle(html): result = html.xpath('//*[@id="program_detail_title"]/text()')[0] return result @@ -43,8 +39,7 @@ def getActorPhoto(browser): return o -def getStudio(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getStudio(html): try: result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']") except: @@ -52,20 +47,14 @@ def getStudio(a): return result.strip('+').replace("', '", '').replace('"', '') -def getRuntime(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getRuntime(html): try: - result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[3]/text()')[0] + x = html.xpath('//span[@class="koumoku" and text()="収録時間"]/../text()')[1].strip() + return x except: return '' - try: - return re.findall('\d+',result1)[0] - except: - return '' - -def getLabel(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getLabel(html): try: result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()')[0] return result @@ -73,8 +62,7 @@ def getLabel(a): return '' -def getNum(a): - html = etree.fromstring(a, etree.HTMLParser()) +def getNum(html): try: result = html.xpath('//*[@id="hinban"]/text()')[0] return result @@ -90,8 +78,7 @@ def getYear(getRelease): return getRelease -def getRelease(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getRelease(html): try: result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1]) except: @@ -102,31 +89,22 @@ def getRelease(a): return '' -def getTag(a): - result2=[] - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() - result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[6]/a/text()') - for i in result1: - i=i.replace(u'\n','') - i=i.replace(u'\t','') - if len(i): - result2.append(i) - return result2 +def getTag(html): + x = html.xpath('//span[@class="koumoku" and text()="ジャンル"]/../a[starts-with(@href,"/avod/genre/")]/text()') + return [translateTag_to_sc(i.strip()) for i in x if len(i.strip())] if len(x) and len(x[0]) else [] -def getCover_small(a, index=0): +def getCover_small(html, index=0): # same issue mentioned below, # javdb sometime returns multiple results # DO NOT just get the firt one, get the one with correct index number - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() result = html.xpath("//div[@class='item-image 
fix-scale-cover']/img/@src")[index] if not 'https' in result: result = 'https:' + result return result -def getCover(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getCover(html): try: result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href')[0] return 'https:' + result @@ -134,8 +112,7 @@ def getCover(htmlcode): return '' -def getDirector(a): - html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text() +def getDirector(html): try: result = html.xpath('//*[@id="program_detail_director"]/text()')[0].replace(u'\n','').replace(u'\t', '') return result @@ -143,19 +120,21 @@ def getDirector(a): return '' -def getOutline(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getOutline(html, number, title): + storyline_site = config.getInstance().storyline_site().split(',') + a = set(storyline_site) & {'airav', 'avno1'} # 只要中文的简介文字 + if len(a): + site = [n for n in storyline_site if n in a] + g = getStoryline(number, title, site) + if len(g): + return g try: - result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[5]/p/text()')[0] + x = html.xpath('//h2[@class="title-detail"]/../p[@class="lead"]/text()')[0] + return x.replace(getNum(html), '') except: return '' - try: - return re.sub('\\\\\w*\d+','',result) - except: - return result -def getSeries(htmlcode): - html = etree.fromstring(htmlcode, etree.HTMLParser()) +def getSeries(html): try: try: result = html.xpath("//span[contains(text(),'シリーズ')]/../a/span/text()")[0] @@ -181,11 +160,10 @@ def getExtrafanart(htmlcode): # 获取剧照 return s return '' -def main(number): - try: +def open_by_browser(number): xcity_number = number.replace('-','') query_result, browser = get_html_by_form( - 'https://xcity.jp/about/', + 'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']), fields = {'q' : xcity_number.lower()}, return_type = 'browser') if not query_result or not query_result.ok: @@ -193,38 +171,44 @@ def main(number): result = browser.follow_link(browser.links('avod\/detail')[0]) if not result.ok: raise ValueError("xcity.py: detail page not found") - detail_page = str(browser.page) + return str(browser.page), browser + +def main(number): + try: + detail_page, browser = open_by_browser(number) url = browser.url - newnum = getNum(detail_page).upper() + lx = etree.fromstring(detail_page, etree.HTMLParser()) + newnum = getNum(lx).upper() number_up = number.upper() if newnum != number_up: - if newnum == xcity_number.upper(): + if newnum == number.replace('-','').upper(): newnum = number_up else: raise ValueError("xcity.py: number not found") + title = getTitle(lx) dic = { 'actor': getActor(browser), - 'title': getTitle(detail_page), - 'studio': getStudio(detail_page), - 'outline': getOutline(detail_page), - 'runtime': getRuntime(detail_page), - 'director': getDirector(detail_page), - 'release': getRelease(detail_page), + 'title': title, + 'studio': getStudio(lx), + 'outline': getOutline(lx, number, title), + 'runtime': getRuntime(lx), + 'director': getDirector(lx), + 'release': getRelease(lx), 'number': newnum, - 'cover': getCover(detail_page), + 'cover': getCover(lx), 'cover_small': '', 'extrafanart': getExtrafanart(detail_page), 'imagecut': 1, - 'tag': getTag(detail_page), - 'label': getLabel(detail_page), - 'year': getYear(getRelease(detail_page)), # str(re.search('\d{4}',getRelease(a)).group()), + 'tag': getTag(lx), + 'label': getLabel(lx), + 'year': getYear(getRelease(lx)), # str(re.search('\d{4}',getRelease(a)).group()), # 
'actor_photo': getActorPhoto(browser), 'website': url, 'source': 'xcity.py', - 'series': getSeries(detail_page), + 'series': getSeries(lx), } except Exception as e: - if config.Config().debug(): + if config.getInstance().debug(): print(e) dic = {"title": ""} diff --git a/config.ini b/config.ini index 58e6892cb..eef14db54 100755 --- a/config.ini +++ b/config.ini @@ -1,12 +1,13 @@ [common] main_mode=1 +source_folder=./ failed_output_folder=failed success_output_folder=JAV_output soft_link=0 failed_move=1 auto_exit=0 transalte_to_sc=0 -multi_threading=1 +multi_threading=0 ;actor_gender value: female(♀) or male(♂) or both(♀ ♂) or all(♂ ♀ ⚧) actor_gender=female del_empty_folder=1 @@ -16,6 +17,8 @@ nfo_skip_days=30 ; 处理完多少个视频文件后停止,0为处理所有视频文件 stop_counter=0 ; 以上两个参数配合使用可以以多次少量的方式刮削或整理数千个文件而不触发翻译或元数据站封禁 +ignore_failed_list=0 +download_only_missing_images=1 [proxy] ;proxytype: http or socks5 or socks5h switch: 0 1 @@ -62,8 +65,7 @@ switch=0 ; 用来确定是否是无码 [uncensored] -uncensored_prefix=S2M,BT,LAF,SMD - +uncensored_prefix=S2M,BT,LAF,SMD,SMBD,SM3D2DBD,SKY-,SKYHD,CWP,CWDV,CWBD,CW3D2DBD,MKD,MKBD,MXBD,MK3D2DBD,MCB3DBD,MCBD,RHJ,RED [media] ; 影片后缀 @@ -82,3 +84,20 @@ water=2 switch=0 extrafanart_folder=extrafanart +; 剧情简介 +[storyline] +; website为javbus javdb avsox xcity carib时,site censored_site uncensored_site 为获取剧情简介信息的 +; 可选数据源站点列表。列表内站点同时并发查询,取值优先级从左到右,靠左站点没数据才会采用后面站点获得的。 +; 其中airav avno1 58avgo是中文剧情简介,区别是airav只能查有码,avno1有码无码都能查,58avgo只能查无码或者 +; 流出破解马赛克的影片(此功能没使用)。 +; xcity和amazon是日语的,由于amazon商城没有番号信息,选中对应DVD的准确率仅99.6%。如果三个列表全部为空则不查询, +; 设置成不查询可大幅提高刮削速度。 +; site= +site=avno1 +censored_site=airav,xcity,amazon +uncensored_site=58avgo +; 运行模式:0:顺序执行(最慢) 1:线程池(默认值) 2:进程池(启动开销比线程池大,并发站点越多越快) +run_mode=1 +; show_result剧情简介调试信息 0关闭 1简略 2详细(详细部分不记入日志),剧情简介失效时可打开2查看原因 +show_result=0 + diff --git a/config.py b/config.py index 82fd34573..f6d6488ef 100644 --- a/config.py +++ b/config.py @@ -1,33 +1,82 @@ import os +import re import sys import configparser -import codecs from pathlib import Path + +G_conf_override = { + # index 0 save Config() first instance for quick access by using getInstance() + 0 : None, + # register override config items + "common:main_mode" : None, + "common:source_folder" : None, + "common:auto_exit" : None, + "common:nfo_skip_days" : None, + "common:stop_counter" : None, + "common:ignore_failed_list" : None, + "debug_mode:switch" : None +} + + +def getInstance(): + if isinstance(G_conf_override[0], Config): + return G_conf_override[0] + return Config() + + class Config: def __init__(self, path: str = "config.ini"): - path_search_order = [ - path, - "./config.ini", - os.path.join(Path.home(), "avdc.ini"), - os.path.join(Path.home(), ".avdc.ini"), - os.path.join(Path.home(), ".avdc/config.ini"), - os.path.join(Path.home(), ".config/avdc/config.ini") - ] + path_search_order = ( + Path(path), + Path.cwd() / "config.ini", + Path.home() / "avdc.ini", + Path.home() / ".avdc.ini", + Path.home() / ".avdc/config.ini", + Path.home() / ".config/avdc/config.ini" + ) ini_path = None for p in path_search_order: - if os.path.isfile(p): - ini_path = p + if p.is_file(): + ini_path = p.resolve() break if ini_path: self.conf = configparser.ConfigParser() + self.ini_path = ini_path try: - self.conf.read(ini_path, encoding="utf-8-sig") + if self.conf.read(ini_path, encoding="utf-8-sig"): + if G_conf_override[0] is None: + G_conf_override[0] = self except: - self.conf.read(ini_path, encoding="utf-8") + if self.conf.read(ini_path, encoding="utf-8"): + if G_conf_override[0] is None: + G_conf_override[0] = self else: 
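G_conf_override doubles as a singleton slot (index 0 holds the first Config instance) and a table of runtime overrides keyed as "section:item". A minimal sketch of how the registered keys are meant to be used, mirroring the self-test at the bottom of config.py:

import config

conf = config.getInstance()   # reuses the first Config() ever constructed
print(conf.debug())           # value read from config.ini

# Registered keys can be overridden process-wide without editing config.ini;
# every Config instance sees the change through the *_override getters.
config.G_conf_override['debug_mode:switch'] = True
assert conf.debug() is True

# Setting the key back to None removes the override.
config.G_conf_override['debug_mode:switch'] = None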
- print("[-]Config file not found!") - sys.exit(2) + print("ERROR: Config file not found!") + print("Please put config file into one of the following path:") + print('\n'.join([str(p.resolve()) for p in path_search_order[2:]])) + # 对于找不到配置文件的情况,还是在打包时附上对应版本的默认配置文件,有需要时为其在搜索路径中生成, + # 要比用户乱找一个版本不对应的配置文件会可靠些。这样一来,单个执行文件就是功能完整的了,放在任何 + # 执行路径下都可以放心使用。 + res_path = None + # pyinstaller打包的在打包中找config.ini + if hasattr(sys, '_MEIPASS') and (Path(getattr(sys, '_MEIPASS')) / 'config.ini').is_file(): + res_path = Path(getattr(sys, '_MEIPASS')) / 'config.ini' + # 脚本运行的所在位置找 + elif (Path(__file__).resolve().parent / 'config.ini').is_file(): + res_path = Path(__file__).resolve().parent / 'config.ini' + if res_path is None: + sys.exit(2) + ins = input("Or, Do you want me create a config file for you? (Yes/No)[Y]:") + if re.search('n', ins, re.I): + sys.exit(2) + # 用户目录才确定具有写权限,因此选择 ~/avdc.ini 作为配置文件生成路径,而不是有可能并没有写权限的 + # 当前目录。目前版本也不再鼓励使用当前路径放置配置文件了,只是作为多配置文件的切换技巧保留。 + write_path = path_search_order[2] # Path.home() / "avdc.ini" + write_path.write_text(res_path.read_text(encoding='utf-8'), encoding='utf-8') + print("Config file '{}' created.".format(write_path.resolve())) + input("Press Enter key exit...") + sys.exit(0) # self.conf = self._default_config() # try: # self.conf = configparser.ConfigParser() @@ -40,13 +89,24 @@ def __init__(self, path: str = "config.ini"): # print("[-]",e) # sys.exit(3) # #self.conf = self._default_config() + def getboolean_override(self, section, item) -> bool: + return self.conf.getboolean(section, item) if G_conf_override[f"{section}:{item}"] is None else bool(G_conf_override[f"{section}:{item}"]) + + def getint_override(self, section, item) -> int: + return self.conf.getint(section, item) if G_conf_override[f"{section}:{item}"] is None else int(G_conf_override[f"{section}:{item}"]) + + def get_override(self, section, item) -> str: + return self.conf.get(section, item) if G_conf_override[f"{section}:{item}"] is None else str(G_conf_override[f"{section}:{item}"]) - def main_mode(self) -> str: + def main_mode(self) -> int: try: - return self.conf.getint("common", "main_mode") + return self.getint_override("common", "main_mode") except ValueError: self._exit("common:main_mode") + def source_folder(self) -> str: + return self.get_override("common", "source_folder") + def failed_folder(self) -> str: return self.conf.get("common", "failed_output_folder") @@ -61,7 +121,7 @@ def soft_link(self) -> bool: def failed_move(self) -> bool: return self.conf.getboolean("common", "failed_move") def auto_exit(self) -> bool: - return self.conf.getboolean("common", "auto_exit") + return self.getboolean_override("common", "auto_exit") def transalte_to_sc(self) -> bool: return self.conf.getboolean("common", "transalte_to_sc") def multi_threading(self) -> bool: @@ -70,14 +130,18 @@ def del_empty_folder(self) -> bool: return self.conf.getboolean("common", "del_empty_folder") def nfo_skip_days(self) -> int: try: - return self.conf.getint("common", "nfo_skip_days") + return self.getint_override("common", "nfo_skip_days") except: return 30 def stop_counter(self) -> int: try: - return self.conf.getint("common", "stop_counter") + return self.getint_override("common", "stop_counter") except: return 0 + def ignore_failed_list(self) -> bool: + return self.getboolean_override("common", "ignore_failed_list") + def download_only_missing_images(self) -> bool: + return self.conf.getboolean("common", "download_only_missing_images") def is_transalte(self) -> bool: return self.conf.getboolean("transalte", 
"switch") def is_trailer(self) -> bool: @@ -173,7 +237,39 @@ def escape_folder(self) -> str: return self.conf.get("escape", "folders") def debug(self) -> bool: - return self.conf.getboolean("debug_mode", "switch") + return self.getboolean_override("debug_mode", "switch") + + def storyline_site(self) -> str: + try: + return self.conf.get("storyline", "site") + except: + return "avno1" + + def storyline_censored_site(self) -> str: + try: + return self.conf.get("storyline", "censored_site") + except: + return "airav,xcity,amazon" + + def storyline_uncensored_site(self) -> str: + try: + return self.conf.get("storyline", "uncensored_site") + except: + return "58avgo" + + def storyline_show(self) -> int: + try: + v = self.conf.getint("storyline", "show_result") + return v if v in (0,1,2) else 2 if v > 2 else 0 + except: + return 0 + + def storyline_mode(self) -> int: + try: + v = self.conf.getint("storyline", "run_mode") + return v if v in (0,1,2) else 2 if v > 2 else 0 + except: + return 1 @staticmethod def _exit(sec: str) -> None: @@ -188,6 +284,7 @@ def _default_config() -> configparser.ConfigParser: sec1 = "common" conf.add_section(sec1) conf.set(sec1, "main_mode", "1") + conf.set(sec1, "source_folder", "./") conf.set(sec1, "failed_output_folder", "failed") conf.set(sec1, "success_output_folder", "JAV_output") conf.set(sec1, "soft_link", "0") @@ -199,6 +296,8 @@ def _default_config() -> configparser.ConfigParser: conf.set(sec1, "del_empty_folder", "1") conf.set(sec1, "nfo_skip_days", 30) conf.set(sec1, "stop_counter", 0) + conf.set(sec1, "ignore_failed_list", 0) + conf.set(sec1, "download_only_missing_images", 1) sec2 = "proxy" conf.add_section(sec2) @@ -265,6 +364,14 @@ def _default_config() -> configparser.ConfigParser: conf.set(sec13, "switch", 1) conf.set(sec13, "extrafanart_folder", "extrafanart") + sec14 = "storyline" + conf.add_section(sec14) + conf.set(sec14, "site", "avno1") + conf.set(sec14, "censored_site", "airav,xcity,amazon") + conf.set(sec14, "uncensored_site", "58avgo") + conf.set(sec14, "show_result", 0) + conf.set(sec14, "run_mode", 1) + return conf @@ -308,9 +415,45 @@ def evprint(evstr): code = compile(evstr, "<string>", "eval") print('{}: "{}"'.format(evstr, eval(code))) config = Config() - mfilter = ('conf', 'proxy', '_exit', '_default_config') + mfilter = {'conf', 'proxy', '_exit', '_default_config', 'getboolean_override', 'getint_override', 'get_override', 'ini_path'} for _m in [m for m in dir(config) if not m.startswith('__') and m not in mfilter]: evprint(f'config.{_m}()') - pfilter = ('proxies', 'SUPPORT_PROXY_TYPE') - for _p in [p for p in dir(config.proxy()) if not p.startswith('__') and p not in pfilter]: - evprint(f'config.proxy().{_p}') + pfilter = {'proxies', 'SUPPORT_PROXY_TYPE'} + # test getInstance() + assert(getInstance() == config) + for _p in [p for p in dir(getInstance().proxy()) if not p.startswith('__') and p not in pfilter]: + evprint(f'getInstance().proxy().{_p}') + + # Override Test + G_conf_override["common:nfo_skip_days"] = 4321 + G_conf_override["common:stop_counter"] = 1234 + assert config.nfo_skip_days() == 4321 + assert getInstance().stop_counter() == 1234 + # remove override + G_conf_override["common:stop_counter"] = None + G_conf_override["common:nfo_skip_days"] = None + assert config.nfo_skip_days() != 4321 + assert config.stop_counter() != 1234 + # Create new instance + conf2 = Config() + assert getInstance() != conf2 + assert getInstance() == config + G_conf_override["common:main_mode"] = 9 + G_conf_override["common:source_folder"] = 
"A:/b/c" + # Override effect to all instances + assert config.main_mode() == 9 + assert conf2.main_mode() == 9 + assert getInstance().main_mode() == 9 + assert conf2.source_folder() == "A:/b/c" + print("### Override Test ###".center(36)) + evprint('getInstance().main_mode()') + evprint('config.source_folder()') + G_conf_override["common:main_mode"] = None + evprint('conf2.main_mode()') + evprint('config.main_mode()') + # unregister key acess will raise except + try: + print(G_conf_override["common:actor_gender"]) + except KeyError as ke: + print(f'Catched KeyError: {ke} is not a register key of G_conf_override dict.', file=sys.stderr) + print(f"Load Config file '{conf2.ini_path}'.") diff --git a/core.py b/core.py index cb1a78238..24c1ce51c 100755 --- a/core.py +++ b/core.py @@ -3,8 +3,6 @@ import pathlib import re import shutil -import platform -import errno import sys from PIL import Image @@ -14,7 +12,7 @@ from ADC_function import * from WebCrawler import get_data_from_json - +from number_parser import is_uncensored def escape_path(path, escape_literals: str): # Remove escape literals backslash = '\\' @@ -23,7 +21,8 @@ def escape_path(path, escape_literals: str): # Remove escape literals return path -def moveFailedFolder(filepath, conf): +def moveFailedFolder(filepath): + conf = config.getInstance() failed_folder = conf.failed_folder() soft_link = conf.soft_link() # 模式3或软连接,改为维护一个失败列表,启动扫描时加载用于排除该路径,以免反复处理 @@ -33,7 +32,6 @@ def moveFailedFolder(filepath, conf): print("[-]Add to Failed List file, see '%s'" % ftxt) with open(ftxt, 'a', encoding='utf-8') as flt: flt.write(f'{filepath}\n') - flt.close() elif conf.failed_move() and not soft_link: failed_name = os.path.join(failed_folder, os.path.basename(filepath)) mtxt = os.path.abspath(os.path.join(failed_folder, 'where_was_i_before_being_moved.txt')) @@ -41,8 +39,13 @@ def moveFailedFolder(filepath, conf): with open(mtxt, 'a', encoding='utf-8') as wwibbmt: tmstr = datetime.now().strftime("%Y-%m-%d %H:%M") wwibbmt.write(f'{tmstr} FROM[{filepath}]TO[{failed_name}]\n') - wwibbmt.close() - shutil.move(filepath, failed_name) + try: + if os.path.exists(failed_name): + print('[-]File Exists while moving to FailedFolder') + return + shutil.move(filepath, failed_name) + except: + print('[-]File Moving to FailedFolder unsuccessful!') def get_info(json_data): # 返回json里的数据 @@ -63,14 +66,15 @@ def get_info(json_data): # 返回json里的数据 return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label -def small_cover_check(path, number, cover_small, leak_word, c_word, conf: config.Config, filepath): +def small_cover_check(path, number, cover_small, leak_word, c_word, filepath): filename = f"{number}{leak_word}{c_word}-poster.jpg" - download_file_with_filename(cover_small, filename, path, conf, filepath) + download_file_with_filename(cover_small, filename, path, filepath) print('[+]Image Downloaded! 
' + os.path.join(path, filename)) -def create_folder(json_data, conf: config.Config): # 创建文件夹 +def create_folder(json_data): # 创建文件夹 title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info(json_data) + conf = config.getInstance() success_folder = conf.success_folder() actor = json_data.get('actor') location_rule = eval(conf.location_rule(), json_data) @@ -81,35 +85,40 @@ def create_folder(json_data, conf: config.Config): # 创建文件夹 if 'title' in conf.location_rule() and len(title) > maxlen: shorttitle = title[0:maxlen] location_rule = location_rule.replace(title, shorttitle) - - path = os.path.join(success_folder, location_rule).strip() - if not os.path.isdir(path): + # 当演员为空时,location_rule被计算为'/number'绝对路径,导致路径连接忽略第一个路径参数,因此添加./使其始终为相对路径 + path = os.path.join(success_folder, f'./{location_rule.strip()}') + if not os.path.exists(path): path = escape_path(path, conf.escape_literals()) try: os.makedirs(path) - if not os.path.isdir(path): - raise except: path = success_folder + '/' + location_rule.replace('/[' + number + ')-' + title, "/number") path = escape_path(path, conf.escape_literals()) + try: + os.makedirs(path) + except: + print(f"[-]Fatal error! Can not make folder '{path}'") + sys.exit(0) - os.makedirs(path) - return path + return os.path.normpath(path) # =====================资源下载部分=========================== # path = examle:photo , video.in the Project Folder! -def download_file_with_filename(url, filename, path, conf: config.Config, filepath): +def download_file_with_filename(url, filename, path, filepath): + conf = config.getInstance() configProxy = conf.proxy() for i in range(configProxy.retry): try: if configProxy.enable: - if not os.path.isdir(path): - os.makedirs(path) - if not os.path.isdir(path): - raise IOError + if not os.path.exists(path): + try: + os.makedirs(path) + except: + print(f"[-]Fatal error! Can not make folder '{path}'") + sys.exit(0) proxies = configProxy.proxies() headers = { 'User-Agent': G_USER_AGENT} @@ -121,10 +130,12 @@ def download_file_with_filename(url, filename, path, conf: config.Config, filepa code.write(r.content) return else: - if not os.path.isdir(path): - os.makedirs(path) - if not os.path.isdir(path): - raise IOError + if not os.path.exists(path): + try: + os.makedirs(path) + except: + print(f"[-]Fatal error! Can not make folder '{path}'") + sys.exit(0) headers = { 'User-Agent': G_USER_AGENT} r = requests.get(url, timeout=configProxy.timeout, headers=headers) @@ -148,46 +159,50 @@ def download_file_with_filename(url, filename, path, conf: config.Config, filepa print('[-]Image Download : Connect retry ' + str(i) + '/' + str(configProxy.retry)) except IOError: print(f"[-]Create Directory '{path}' failed!") - moveFailedFolder(filepath, conf) + moveFailedFolder(filepath) return print('[-]Connect Failed! 
-    moveFailedFolder(filepath, conf)
+    moveFailedFolder(filepath)
     return
 
 
-def trailer_download(trailer, leak_word, c_word, number, path, filepath, conf: config.Config):
-    if download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, conf, filepath) == 'failed':
+def trailer_download(trailer, leak_word, c_word, number, path, filepath):
+    if download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, filepath) == 'failed':
         return
-    configProxy = conf.proxy()
+    configProxy = config.getInstance().proxy()
     for i in range(configProxy.retry):
-        if os.path.getsize(path+'/' + number + leak_word + c_word + '-trailer.mp4') == 0:
+        if file_not_exist_or_empty(path+'/' + number + leak_word + c_word + '-trailer.mp4'):
            print('[!]Video Download Failed! Trying again. [{}/3]', i + 1)
-            download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, conf, filepath)
+            download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, filepath)
            continue
         else:
             break
-    if os.path.getsize(path + '/' + number + leak_word + c_word + '-trailer.mp4') == 0:
+    if file_not_exist_or_empty(path + '/' + number + leak_word + c_word + '-trailer.mp4'):
         return
     print('[+]Video Downloaded!', path + '/' + number + leak_word + c_word + '-trailer.mp4')
 
 
 # download the stills (extrafanart); on failure, move to the failed folder
-def extrafanart_download(data, path, conf: config.Config, filepath):
+def extrafanart_download(data, path, filepath):
     j = 1
+    conf = config.getInstance()
     path = os.path.join(path, conf.get_extrafanart())
+    configProxy = conf.proxy()
+    download_only_missing_images = conf.download_only_missing_images()
     for url in data:
         jpg_filename = f'extrafanart-{j}.jpg'
         jpg_fullpath = os.path.join(path, jpg_filename)
-        if download_file_with_filename(url, jpg_filename, path, conf, filepath) == 'failed':
-            moveFailedFolder(filepath, conf)
+        if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath):
+            continue
+        if download_file_with_filename(url, jpg_filename, path, filepath) == 'failed':
+            moveFailedFolder(filepath)
             return
-        configProxy = conf.proxy()
         for i in range(configProxy.retry):
-            if os.path.getsize(jpg_fullpath) == 0:
+            if file_not_exist_or_empty(jpg_fullpath):
                print('[!]Image Download Failed! Trying again. [{}/3]', i + 1)
-                download_file_with_filename(url, jpg_filename, path, conf, filepath)
+                download_file_with_filename(url, jpg_filename, path, filepath)
                 continue
             else:
                 break
-        if os.path.getsize(jpg_fullpath) == 0:
+        if file_not_exist_or_empty(jpg_fullpath):
             return
         print('[+]Image Downloaded!', jpg_fullpath)
         j += 1
@@ -195,39 +210,46 @@
 
 
 # download the cover; on failure, move to the failed folder
-def image_download(cover, number, leak_word, c_word, path, conf: config.Config, filepath):
+def image_download(cover, number, leak_word, c_word, path, filepath):
     filename = f"{number}{leak_word}{c_word}-fanart.jpg"
     full_filepath = os.path.join(path, filename)
-    if download_file_with_filename(cover, filename, path, conf, filepath) == 'failed':
-        moveFailedFolder(filepath, conf)
+    if config.getInstance().download_only_missing_images() and not file_not_exist_or_empty(full_filepath):
+        return
+    if download_file_with_filename(cover, filename, path, filepath) == 'failed':
+        moveFailedFolder(filepath)
         return
-    configProxy = conf.proxy()
+    configProxy = config.getInstance().proxy()
     for i in range(configProxy.retry):
-        if os.path.getsize(full_filepath) == 0:
+        if file_not_exist_or_empty(full_filepath):
             print('[!]Image Download Failed! Trying again. [{}/3]', i + 1)
-            download_file_with_filename(cover, filename, path, conf, filepath)
+            download_file_with_filename(cover, filename, path, filepath)
             continue
         else:
             break
-    if os.path.getsize(full_filepath) == 0:
+    if file_not_exist_or_empty(full_filepath):
         return
     print('[+]Image Downloaded!', full_filepath)
     shutil.copyfile(full_filepath, os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg"))
 
 
-def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, uncensored, conf):
+def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, uncensored):
     title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info(json_data)
-    failed_folder = conf.failed_folder()
-    if conf.main_mode() == 3:  # In mode 3 the video file is left untouched, so the .nfo must match the video filename exactly (apart from the extension) for KODI and similar software to find it
+    if config.getInstance().main_mode() == 3:  # In mode 3 the video file is left untouched, so the .nfo must match the video filename exactly (apart from the extension) for KODI and similar software to find it
         nfo_path = str(Path(filepath).with_suffix('.nfo'))
     else:
         nfo_path = os.path.join(path,f"{number}{part}{leak_word}{c_word}.nfo")
     try:
-        if not os.path.isdir(path):
-            os.makedirs(path)
-        if not os.path.isdir(path):
-            raise IOError
+        if not os.path.exists(path):
+            try:
+                os.makedirs(path)
+            except:
+                print(f"[-]Fatal error! Can not make folder '{path}'")
+                sys.exit(0)
+
+        # KODI cannot show the number when browsing movie info; configuring naming_rule=number+'#'+title would fix that
+        # but makes the title too long. The usually-empty outline field is a better fit, and the software reserves a larger display area for outline anyway
+        outline = f"{number}#{outline}"
         with open(nfo_path, "wt", encoding='UTF-8') as code:
             print('<?xml version="1.0" encoding="UTF-8" ?>', file=code)
             print("<movie>", file=code)
@@ -279,7 +301,7 @@
             print("    <num>" + number + "</num>", file=code)
             print("    <premiered>" + release + "</premiered>", file=code)
             print("    <cover>" + cover + "</cover>", file=code)
-            if config.Config().is_trailer():
+            if config.getInstance().is_trailer():
                 print("    <trailer>" + trailer + "</trailer>", file=code)
             print("    <website>" + website + "</website>", file=code)
             print("</movie>", file=code)
@@ -287,12 +309,12 @@
     except IOError as e:
         print("[-]Write Failed!")
         print("[-]", e)
-        moveFailedFolder(filepath, conf)
+        moveFailedFolder(filepath)
         return
     except Exception as e1:
         print("[-]Write Failed!")
         print("[-]", e1)
-        moveFailedFolder(filepath, conf)
+        moveFailedFolder(filepath)
         return
 
 
@@ -321,7 +343,7 @@ def cutImage(imagecut, path, number, leak_word, c_word):
 # leak: leaked, value 1 or 0
 # uncensored: value 1 or 0
 # ======================================================================== add watermark
-def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf:config.Config):
+def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored):
     mark_type = ''
     if cn_sub:
         mark_type += ',字幕'
@@ -331,17 +353,17 @@
         mark_type += ',无码'
     if mark_type == '':
         return
-    add_mark_thread(thumb_path, cn_sub, leak, uncensored, conf)
+    add_mark_thread(thumb_path, cn_sub, leak, uncensored)
     print('[+]Thumb Add Mark: ' + mark_type.strip(','))
-    add_mark_thread(poster_path, cn_sub, leak, uncensored, conf)
+    add_mark_thread(poster_path, cn_sub, leak, uncensored)
     print('[+]Poster Add Mark: ' + mark_type.strip(','))
 
 
-def add_mark_thread(pic_path, cn_sub, leak, uncensored, conf):
+def add_mark_thread(pic_path, cn_sub, leak, uncensored):
     size = 14
     img_pic = Image.open(pic_path)
     # get the configured corner; the modulo arithmetic together with pos places marks clockwise
     # top-left 0, top-right 1, bottom-right 2, bottom-left 3
-    count = conf.watermark_type()
+    count = config.getInstance().watermark_type()
     if cn_sub == 1 or cn_sub == '1':
         add_to_pic(pic_path, img_pic, size, count, 1)  # add
         count = (count + 1) % 4
@@ -391,29 +413,38 @@
     img_pic.save(pic_path, quality=95)
 
 
# ======================== end =================================
-def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config.Config):  # file path, number, suffix, destination
+def paste_file_to_folder(filepath, path, number, leak_word, c_word):  # file path, number, suffix, destination
     filepath_obj = pathlib.Path(filepath)
     houzhui = filepath_obj.suffix
     file_parent_origin_path = str(filepath_obj.parent)
     try:
         targetpath = os.path.join(path, f"{number}{leak_word}{c_word}{houzhui}")
+        # Never overwrite under any circumstances: if a data source or engine error gives every file the same number,
+        # overwriting them one by one under the same name would destroy all files beyond recovery, the worst case
+        if os.path.exists(targetpath):
+            raise FileExistsError('File Exists on destination path, we will never overwrite.')
+        soft_link = config.getInstance().soft_link()
         # if soft_link=1, use a symlink
-        if conf.soft_link() == 0:
+        if soft_link == 0:
             shutil.move(filepath, targetpath)
-        elif conf.soft_link() == 1:
-            # use a relative path so the video opens correctly when accessed over the network
-            filerelpath = os.path.relpath(filepath, path)
-            os.symlink(filerelpath, targetpath)
-        elif conf.soft_link() == 2:
+        elif soft_link == 1:
+            # First try a relative path, so the video opens correctly when accessed over the network; if that fails
+            # (e.g. across drive letters, where relative paths are unsupported), fall back to an absolute-path symlink
+            try:
+                filerelpath = os.path.relpath(filepath, path)
+                os.symlink(filerelpath, targetpath)
+            except:
+                os.symlink(filepath_obj.resolve(), targetpath)
+        elif soft_link == 2:
             shutil.move(filepath, targetpath)
             # After moving the file, leave a traceable symlink at the original location pointing to the file's new position,
             # so it can be traced where a file was moved; this avoids files going missing after a rename/move based on a wrong number
-            # and makes manual recovery easy. Also change the symlink's file extension so it will not be scraped again.
+            # and makes manual recovery easy. Since symlinks are no longer scraped, the extension no longer needs changing.
             targetabspath = os.path.abspath(targetpath)
             if targetabspath != os.path.abspath(filepath):
                 targetrelpath = os.path.relpath(targetabspath, file_parent_origin_path)
-                os.symlink(targetrelpath, filepath + '#sym')
+                os.symlink(targetrelpath, filepath)
-        sub_res = conf.sub_rule()
+        sub_res = config.getInstance().sub_rule()
 
         for subname in sub_res:
             sub_filepath = str(filepath_obj.with_suffix(subname))
@@ -422,9 +453,9 @@
                 print('[+]Sub moved!')
         return True
 
-    except FileExistsError:
-        print('[-]File Exists! Please check your movie!')
-        print('[-]move to the root folder of the program.')
+    except FileExistsError as fee:
+        print(f'[-]FileExistsError: {fee}')
+        moveFailedFolder(filepath)
         return
     except PermissionError:
         print('[-]Error! Please run as administrator!')
@@ -434,19 +465,22 @@
         return
 
 
-def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, conf):  # file path, number, suffix, destination
+def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word):  # file path, number, suffix, destination
     if multi_part == 1:
         number += part  # number gets the CD1 suffix appended here
     filepath_obj = pathlib.Path(filepath)
     houzhui = filepath_obj.suffix
     file_parent_origin_path = str(filepath_obj.parent)
+    targetpath = os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}")
+    if os.path.exists(targetpath):
+        raise FileExistsError('File Exists on destination path, we will never overwrite.')
     try:
-        if conf.soft_link():
-            os.symlink(filepath, os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}"))
+        if config.getInstance().soft_link():
+            os.symlink(filepath, targetpath)
         else:
-            shutil.move(filepath, os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}"))
+            shutil.move(filepath, targetpath)
 
-        sub_res = conf.sub_rule()
+        sub_res = config.getInstance().sub_rule()
         for subname in sub_res:
             sub_filepath = str(filepath_obj.with_suffix(subname))
             if os.path.isfile(sub_filepath):  # move the subtitles
@@ -454,9 +488,8 @@
                 print('[+]Sub moved!')
         print('[!]Success')
         return True
-    except FileExistsError:
-        print('[-]File Exists! Please check your movie!')
-        print('[-]move to the root folder of the program.')
+    except FileExistsError as fee:
+        print(f'[-]FileExistsError: {fee}')
        return
     except PermissionError:
         print('[-]Error! Please run as administrator!')
@@ -465,7 +498,7 @@
         print(f'[-]OS Error errno {oserr.errno}')
         return
 
-def get_part(filepath, conf):
+def get_part(filepath):
     try:
         if re.search('-CD\d+', filepath):
             return re.findall('-CD\d+', filepath)[0]
         if re.search('-cd\d+', filepath):
             return re.findall('-cd\d+', filepath)[0]
     except:
         print("[-]Failed! Please rename the file and try again!")
-        moveFailedFolder(filepath, conf)
+        moveFailedFolder(filepath)
         return
 
@@ -493,7 +526,8 @@ def debug_print(data: json):
         pass
 
 
-def core_main(file_path, number_th, conf: config.Config):
+def core_main(file_path, number_th):
+    conf = config.getInstance()
     # ======================================================================= initialize the required variables
     multi_part = 0
     part = ''
@@ -507,11 +541,11 @@
     # the commented-out variables below are not needed
     #rootpath= os.getcwd
     number = number_th
-    json_data = get_data_from_json(number, conf)  # define the number
+    json_data = get_data_from_json(number)  # define the number
 
     # Return if blank dict returned (data not found)
     if not json_data:
-        moveFailedFolder(filepath, conf)
+        moveFailedFolder(filepath)
         return
 
     if json_data["number"] != number:
@@ -526,16 +560,13 @@
     # ======================================================================= detect the -C and -CD suffixes
     if '-CD' in filepath or '-cd' in filepath:
         multi_part = 1
-        part = get_part(filepath, conf)
+        part = get_part(filepath)
     if '-c.' in filepath or '-C.' in filepath or '中文' in filepath or '字幕' in filepath:
         cn_sub = '1'
         c_word = '-C'  # suffix for movies with Chinese subtitles
 
     # determine whether the movie is uncensored
-    if is_uncensored(number):
-        uncensored = 1
-    else:
-        uncensored = 0
+    uncensored = 1 if is_uncensored(number) else 0
 
     if '流出' in filepath or 'uncensored' in filepath:
@@ -550,7 +581,7 @@
     debug_print(json_data)
 
     # create the folder
-    #path = create_folder(rootpath + '/' + conf.success_folder(), json_data.get('location_rule'), json_data, conf)
+    #path = create_folder(rootpath + '/' + conf.success_folder(), json_data.get('location_rule'), json_data)
 
     # main_mode
     # 1: Scraping mode
     # 2: Organizing mode
     # 3: Scrape in place (paths unchanged)
     if conf.main_mode() == 1:
         # create the folder
-        path = create_folder(json_data, conf)
+        path = create_folder(json_data)
         if multi_part == 1:
             number += part  # number gets the CD1 suffix appended here
 
         # check the small cover; if imagecut is 3, download the small cover
         if imagecut == 3:
-            small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, conf, filepath)
+            small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, filepath)
 
         # create_folder returns the path for the number
-        image_download( json_data.get('cover'), number, leak_word, c_word, path, conf, filepath)
+        image_download( json_data.get('cover'), number, leak_word, c_word, path, filepath)
 
         if not multi_part or part.lower() == '-cd1':
             try:
                 # download the trailer
                 if conf.is_trailer() and json_data.get('trailer'):
-                    trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath, conf)
+                    trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath)
             except:
                 pass
 
             try:
-                # download the stills: data, path, conf: config.Config, filepath
+                # download the stills: data, path, filepath
                 if conf.is_extrafanart() and json_data.get('extrafanart'):
-                    extrafanart_download(json_data.get('extrafanart'), path, conf, filepath)
+                    extrafanart_download(json_data.get('extrafanart'), path, filepath)
             except:
                 pass
 
         # crop the image
         cutImage(imagecut, path, number, leak_word, c_word)
 
-        # write the files
-        print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored, conf)
-
-        # move the file
-        paste_file_to_folder(filepath, path, number, leak_word, c_word, conf)
-
+        # add the watermark
         poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg")
         thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg")
         if conf.is_watermark():
-            add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf)
+            add_mark(poster_path, thumb_path, cn_sub, leak, uncensored)
+
+        # move the movie
+        paste_file_to_folder(filepath, path, number, leak_word, c_word)
+
+        # write the .nfo metadata file last, so that its successful creation marks the task as complete
+        print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored)
 
     elif conf.main_mode() == 2:
         # create the folder
-        path = create_folder(json_data, conf)
+        path = create_folder(json_data)
 
         # move the file
-        paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, conf)
+        paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word)
 
         poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg")
         thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg")
         if conf.is_watermark():
-            add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf)
+            add_mark(poster_path, thumb_path, cn_sub, leak, uncensored)
 
     elif conf.main_mode() == 3:
         path = str(Path(file_path).parent)
@@ -614,28 +646,29 @@
 
         # check the small cover; if imagecut is 3, download the small cover
         if imagecut == 3:
-            small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, conf, filepath)
+            small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, filepath)
 
         # create_folder returns the path for the number
-        image_download(json_data.get('cover'), number, leak_word, c_word, path, conf, filepath)
+        image_download(json_data.get('cover'), number, leak_word, c_word, path, filepath)
 
         if not multi_part or part.lower() == '-cd1':
             # download the trailer
             if conf.is_trailer() and json_data.get('trailer'):
-                trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath, conf)
+                trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath)
 
-            # download the stills: data, path, conf: config.Config, filepath
+            # download the stills: data, path, filepath
             if conf.is_extrafanart() and json_data.get('extrafanart'):
-                extrafanart_download(json_data.get('extrafanart'), path, conf, filepath)
+                extrafanart_download(json_data.get('extrafanart'), path, filepath)
 
         # crop the image
         cutImage(imagecut, path, number, leak_word, c_word)
 
-        # write the files
-        print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath,
-                    tag, json_data.get('actor_list'), liuchu, uncensored, conf)
-
+        # add the watermark
         poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg")
         thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg")
         if conf.is_watermark():
-            add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf)
+            add_mark(poster_path, thumb_path, cn_sub, leak, uncensored)
+
+        # write the .nfo metadata file last, so that its successful creation marks the task as complete
+        print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath,
+                    tag, json_data.get('actor_list'), liuchu, uncensored)
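The core.py changes center on three behaviors: never overwrite an existing target file, write the .nfo last as the success marker, and, for soft_link == 1, try a relative symlink before falling back to an absolute one. A standalone sketch of that symlink fallback (function name and error handling are illustrative, not the patch's code):

    import os
    import pathlib

    def link_movie(src: str, dst: str):
        # prefer a relative link so the target still resolves when the tree
        # is mounted over the network under a different root
        try:
            rel = os.path.relpath(src, os.path.dirname(dst))
            os.symlink(rel, dst)
        except (ValueError, OSError):
            # e.g. ValueError on Windows when src and dst sit on different drives
            os.symlink(pathlib.Path(src).resolve(), dst)

os.path.relpath raises ValueError on Windows when the two paths are on different drive letters, which is exactly the cross-drive case the comment in paste_file_to_folder describes.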
diff --git a/number_parser.py b/number_parser.py
index 2d1874e1b..4d4fe937a 100755
--- a/number_parser.py
+++ b/number_parser.py
@@ -1,14 +1,14 @@
 import os
 import re
-from core import *
-
+import sys
+import config
 
 G_spat = re.compile(
     "^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|^hhd800\.com@",
     re.IGNORECASE)
 
 
-def get_number(debug,filepath: str) -> str:
+def get_number(debug,file_path: str) -> str:
     # """
     # >>> from number_parser import get_number
     # >>> get_number("/Users/Guest/AV_Data_Capture/snis-829.mp4")
     # 'snis-829'
@@ -32,77 +32,174 @@
     # >>> get_number("snis-829-C.mp4")
     # 'snis-829'
     # """
-    filepath = os.path.basename(filepath)
-
-    if debug == False:
-        try:
-            if '-' in filepath or '_' in filepath:  # normal number extraction, mainly for numbers containing '-' or '_'
-                #filepath = filepath.replace("_", "-")
-                filepath = G_spat.sub("", filepath)
-                filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath))  # strip the date from the filename
-                lower_check = filename.lower()
-                if 'fc2' in lower_check:
-                    filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper()
-                file_number = get_number_by_dict(lower_check)
-                if file_number:
-                    return file_number
-                return str(re.search(r'\w+(-|_)\w+', filename, re.A).group())
-            else:  # extract numbers without '-': FANZA CID
-                # matching rule for western numbers
-                oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath)
-                if oumei:
-                    return oumei.group()
-
-                try:
-                    return str(
-                        re.findall(r'(.+?)\.',
-                                   str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip(
-                        "['']").replace('_', '-')
-                except:
-                    return re.search(r'(.+?)\.', filepath)[0]
-        except Exception as e:
-            print('[-]' + str(e))
-            return
-    elif debug == True:
-        if '-' in filepath or '_' in filepath:  # normal number extraction, mainly for numbers containing '-' or '_'
-            #filepath = filepath.replace("_", "-")
+    filepath = os.path.basename(file_path)
+    # The debug True and False branches have been merged: this module does pure string computation with no IO, so when debug is on it is enough to print the exception that caused the failure
+    try:
+        file_number = get_number_by_dict(filepath)
+        if file_number:
+            return file_number
+        elif '-' in filepath or '_' in filepath:  # normal number extraction, mainly for numbers containing '-' or '_'
             filepath = G_spat.sub("", filepath)
             filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath))  # strip the date from the filename
             lower_check = filename.lower()
             if 'fc2' in lower_check:
                 filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper()
-            file_number = get_number_by_dict(lower_check)
-            if file_number:
-                return file_number
             return str(re.search(r'\w+(-|_)\w+', filename, re.A).group())
         else:  # extract numbers without '-': FANZA CID
             # matching rule for western numbers
             oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath)
             if oumei:
                 return oumei.group()
             try:
                 return str(
                     re.findall(r'(.+?)\.',
                                str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip(
                     "['']").replace('_', '-')
             except:
-                return re.search(r'(.+?)\.', filepath)[0]
+                return str(re.search(r'(.+?)\.', filepath)[0])
+    except Exception as e:
+        if debug:
+            print(f'[-]Number Parser exception: {e} [{file_path}]')
+        return None
+
+# extract the number according to the naming conventions of the javdb data source
 G_TAKE_NUM_RULES = {
-    'tokyo' : lambda x:str(re.search(r'(cz|gedo|k|n|red-|se)\d{2,4}', x, re.A).group()),
-    'carib' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.A).group()).replace('_', '-'),
-    '1pon' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.A).group()).replace('-', '_'),
-    '10mu' : lambda x:str(re.search(r'\d{6}(-|_)\d{2}', x, re.A).group()).replace('-', '_'),
-    'x-art' : lambda x:str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.A).group())
-    }
+    'tokyo.*hot' : lambda x:str(re.search(r'(cz|gedo|k|n|red-|se)\d{2,4}', x, re.I).group()),
+    'carib' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('_', '-'),
+    '1pon|mura|paco' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('-', '_'),
+    '10mu' : lambda x:str(re.search(r'\d{6}(-|_)\d{2}', x, re.I).group()).replace('-', '_'),
+    'x-art' : lambda x:str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.I).group()),
+    'xxx-av': lambda x:''.join(['xxx-av-', re.findall(r'xxx-av[^\d]*(\d{3,5})[^\d]*', x, re.I)[0]]),
+    'heydouga': lambda x:'heydouga-' + '-'.join(re.findall(r'(\d{4})[\-_](\d{3,4})[^\d]*', x, re.I)[0])
+}
 
-def get_number_by_dict(lower_filename: str) -> str:
-    for k,v in G_TAKE_NUM_RULES.items():
-        if k in lower_filename:
-            return v(lower_filename)
+def get_number_by_dict(filename: str) -> str:
+    try:
+        for k,v in G_TAKE_NUM_RULES.items():
+            if re.search(k, filename, re.I):
+                return v(filename)
+    except:
+        pass
     return None
 
-# if __name__ == "__main__":
+class Cache_uncensored_conf:
+    prefix = None
+    def is_empty(self):
+        return bool(self.prefix is None)
+    def set(self, v: list):
+        if not v or not len(v) or not len(v[0]):
+            raise ValueError('input prefix list empty or None')
+        s = v[0]
+        if len(v) > 1:
+            for i in v[1:]:
+                s += f"|{i}.+"
+        self.prefix = re.compile(s, re.I)
+    def check(self, number):
+        if self.prefix is None:
+            raise ValueError('No init re compile')
+        return self.prefix.match(number)
+
+G_cache_uncensored_conf = Cache_uncensored_conf()
+
+# ======================================================================== uncensored check
+def is_uncensored(number):
+    if re.match(
+r'[\d-]{4,}|\d{6}_\d{2,3}|(cz|gedo|k|n|red-|se)\d{2,4}|heyzo.+|xxx-av-.+|heydouga-.+|x-art\.\d{2}\.\d{2}\.\d{2}',
+        number,
+        re.I
+    ):
+        return True
+    if G_cache_uncensored_conf.is_empty():
+        G_cache_uncensored_conf.set(config.getInstance().get_uncensored().split(','))
+    return G_cache_uncensored_conf.check(number)
+
+if __name__ == "__main__":
     # import doctest
     # doctest.testmod(raise_on_error=True)
+    test_use_cases = (
+        "Tokyo Hot n9001 FHD.mp4",  # no '-'; previously could not be extracted correctly
+        "TokyoHot-n1287-HD SP2006 .mp4",
+        "caribean-020317_001.nfo",  # '-' mistyped as '_'
+        "257138_3xplanet_1Pondo_080521_001.mp4",
+        "ADV-R0624-CD3.wmv",  # multi-disc movie
+        "XXX-AV 22061-CD5.iso",  # newly supported studio format xxx-av-22061; naming rule from the javdb data source
+        "xxx-av 20589.mp4",
+        "Muramura-102114_145-HD.wmv",  # newly supported studio format 102114_145; naming rule from the javdb data source
+        "heydouga-4102-023-CD2.iso",  # newly supported studio format heydouga-4102-023; naming rule from the javdb data source
+        "HeyDOuGa4236-1048 Ai Qiu - .mp4",  # heydouga-4236-1048; naming rule from the javdb data source
+        "pacopacomama-093021_539-FHD.mkv"  # newly supported studio format 093021_539; naming rule from the javdb data source
+    )
+    def evprint(evstr):
+        code = compile(evstr, "<string>", "eval")
+        print("{1:>20} # '{0}'".format(evstr[18:-2], eval(code)))
+    for t in test_use_cases:
+        evprint(f'get_number(True, "{t}")')
+
+    if len(sys.argv)<=1 or not re.search('^[A-Z]:?', sys.argv[1], re.IGNORECASE):
+        sys.exit(0)
+
+    # Use Everything's ES command-line tool to collect all video filenames on disk as number-parser test cases; the argument is a drive letter A..Z or a path that includes the drive letter
+    # https://www.voidtools.com/support/everything/command_line_interface/
+    # The ES command-line tool requires the Everything file search engine to be running, and the single es.exe executable must be placed in a directory on PATH.
+    # Everything is free software.
+    # Examples:
+    # python.exe .\number_parser.py ALL              # search all disks for videos
+    # python.exe .\number_parser.py D                # search drive D
+    # python.exe .\number_parser.py D:               # same as above
+    # python.exe .\number_parser.py D:\download\JAVs # search D:\download\JAVs; the path must include the drive letter
+    # ==================
+    # On Linux/WSL1|2, use mlocate (Ubuntu/Debian) or plocate (Debian sid) to collect all video filenames as test-case number data.
+    # Install with 'sudo apt install mlocate' (or plocate) and run 'sudo updatedb' once to build the full-disk index.
+    # On MAC OS X, use glocate from findutils; install with 'sudo brew install findutils' and run 'sudo gupdatedb' once to build the index.
+    # Example:
+    # python3 ./number_parser.py ALL
+    import subprocess
+    ES_search_path = "ALL disks"
+    if sys.argv[1] == "ALL":
+        if sys.platform == "win32":
+            # ES_prog_path = 'C:/greensoft/es/es.exe'
+            ES_prog_path = 'es.exe'  # es.exe must be in a directory on PATH
+            ES_cmdline = f'{ES_prog_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;flv;ts;webm;iso;mpg;m4v'
+            out_bytes = subprocess.check_output(ES_cmdline.split(' '))
+            out_text = out_bytes.decode('gb18030')  # Chinese Windows 10 x64 outputs GB18030 by default; the encoding is a UNICODE dialect that maps to and from UTF-8 without transcoding loss
+            out_list = out_text.splitlines()
+        elif sys.platform in ("linux", "darwin"):
+            ES_prog_path = 'locate' if sys.platform == 'linux' else 'glocate'
+            ES_cmdline = r"{} -b -i --regex '\.mp4$|\.avi$|\.rmvb$|\.wmv$|\.mov$|\.mkv$|\.webm$|\.iso$|\.mpg$|\.m4v$'".format(ES_prog_path)
+            out_bytes = subprocess.check_output(ES_cmdline.split(' '))
+            out_text = out_bytes.decode('utf-8')
+            out_list = [ os.path.basename(line) for line in out_text.splitlines()]
+        else:
+            print('[-]Unsupported platform! Please run on OS Windows/Linux/MacOSX. Exit.')
+            sys.exit(1)
+    else:  # Windows single disk
+        if sys.platform != "win32":
+            print('[!]Usage: python3 ./number_parser.py ALL')
+            sys.exit(0)
+        # ES_prog_path = 'C:/greensoft/es/es.exe'
+        ES_prog_path = 'es.exe'  # es.exe must be in a directory on PATH
+        if os.path.isdir(sys.argv[1]):
+            ES_search_path = sys.argv[1]
+        else:
+            ES_search_path = sys.argv[1][0] + ':/'
+            if not os.path.isdir(ES_search_path):
+                ES_search_path = 'C:/'
+        ES_search_path = os.path.normcase(ES_search_path)
+        ES_cmdline = f'{ES_prog_path} -path {ES_search_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;webm;iso;mpg;m4v'
+        out_bytes = subprocess.check_output(ES_cmdline.split(' '))
+        out_text = out_bytes.decode('gb18030')  # Chinese Windows 10 x64 outputs GB18030 by default; the encoding is a UNICODE dialect that maps to and from UTF-8 without transcoding loss
+        out_list = out_text.splitlines()
+    print(f'\n[!]{ES_prog_path} is searching {ES_search_path} for movies as number parser test cases...')
+    print(f'[+]Find {len(out_list)} Movies.')
+    for filename in out_list:
+        try:
+            n = get_number(True, filename)
+            if n:
+                print(' [{0}] {2}# {1}'.format(n, filename, '#uncensored' if is_uncensored(n) else ''))
+            else:
+                print(f'[-]Number return None. # {filename}')
+        except Exception as e:
+            print(f'[-]Number Parser exception: {e} [{filename}]')
+
+    sys.exit(0)
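Two details of the new number_parser.py are worth spelling out. get_number_by_dict now treats each G_TAKE_NUM_RULES key as a regex rather than a plain substring, so a single entry such as '1pon|mura|paco' covers several studios. And Cache_uncensored_conf compiles the user's uncensored-prefix list into one prefix-anchored pattern once, instead of re-splitting the config string on every call. A small illustration of the compiled pattern (the prefix values here are made up, not defaults from config.ini):

    import re

    prefixes = "S-Cute,ONE,LUXU".split(',')  # stand-in for the uncensored prefix option
    s = prefixes[0]
    for i in prefixes[1:]:
        s += f"|{i}.+"                       # later prefixes must be followed by something
    pattern = re.compile(s, re.I)            # -> 'S-Cute|ONE.+|LUXU.+'
    print(bool(pattern.match('ONE-392')))    # True: match() anchors at the string start
    print(bool(pattern.match('snis-829')))   # False: no configured prefix matches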
diff --git a/py_to_exe.ps1 b/py_to_exe.ps1
index 7fc0f803f..77f169aa6 100644
--- a/py_to_exe.ps1
+++ b/py_to_exe.ps1
@@ -3,14 +3,15 @@
 $CLOUDSCRAPER_PATH=$(python -c 'import cloudscraper as _; print(_.__path__[0])' | select -Last 1)
 
-mkdir build 
+mkdir build
 mkdir __pycache__
 
 pyinstaller --onefile AV_Data_Capture.py `
     --hidden-import ADC_function.py `
     --hidden-import core.py `
     --add-data "$CLOUDSCRAPER_PATH;cloudscraper" `
-    --add-data "Img;Img"
+    --add-data "Img;Img" `
+    --add-data "config.ini;."
 
 rmdir -Recurse -Force build
 rmdir -Recurse -Force __pycache__
diff --git a/wrapper/FreeBSD.sh b/wrapper/FreeBSD.sh
index 70f27d7c9..9717ef469 100755
--- a/wrapper/FreeBSD.sh
+++ b/wrapper/FreeBSD.sh
@@ -1,4 +1,8 @@
 pkg install python38 py38-requests py38-pip py38-lxml py38-pillow py38-cloudscraper py38-pysocks git zip py38-beautifulsoup448
 pip install pyquery pyinstaller
-pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "$(python3.8 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" --add-data "Img:Img"
+pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
+    --add-data "$(python3.8 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
+    --add-data "Img:Img" \
+    --add-data "config.ini:."
+
 cp config.ini ./dist
diff --git a/wrapper/Linux.sh b/wrapper/Linux.sh
index 1d05e6a65..63e3b1c19 100755
--- a/wrapper/Linux.sh
+++ b/wrapper/Linux.sh
@@ -12,5 +12,9 @@
 #fi
 pip3 install -r requirements.txt
 pip3 install cloudscraper==1.2.52
-pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" --add-data "Img:Img"
+pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
+    --add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
+    --add-data "Img:Img" \
+    --add-data "config.ini:."
+
 cp config.ini ./dist
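All four build scripts now bundle config.ini alongside the Img directory. The only difference between the platform variants is the --add-data separator, which PyInstaller defines as os.pathsep: ';' on Windows (the .ps1 script and the windows CI job), ':' on macOS, Linux, and FreeBSD. A wrapper that computes the argument instead of hard-coding it per platform could look like this (a sketch, not part of the patch):

    import os

    sep = ';' if os.name == 'nt' else ':'
    add_data_arg = f'config.ini{sep}.'      # bundle config.ini into the archive root
    print(f'--add-data "{add_data_arg}"')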