diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 6b0a748e7..289c88ea4 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -33,7 +33,7 @@ jobs:
- name: Test number_perser.get_number
run: |
python number_parser.py -v
-
+
- name: Build with PyInstaller for macos/ubuntu
if: matrix.os == 'macos-latest' || matrix.os == 'ubuntu-latest'
run: |
@@ -42,6 +42,8 @@ jobs:
--hidden-import ADC_function.py \
--hidden-import core.py \
--add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
+ --add-data "Img:Img" \
+ --add-data "config.ini:." \
- name: Build with PyInstaller for windows
if: matrix.os == 'windows-latest'
@@ -51,6 +53,8 @@ jobs:
--hidden-import ADC_function.py `
--hidden-import core.py `
--add-data "$(python -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1);cloudscraper" `
+ --add-data "Img;Img" `
+ --add-data "config.ini;." `
- name: Copy config.ini
run: |
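
Note on the two --add-data forms above: PyInstaller uses ':' as the src:dest separator on
macOS/Linux and ';' on Windows, which is why the two build steps differ. At runtime a onefile
build unpacks bundled data under sys._MEIPASS, so frozen code has to resolve Img/ and
config.ini from there. A minimal sketch (resource_path is a hypothetical helper, not part of
this repo):

import os
import sys

def resource_path(relative: str) -> str:
    # Frozen by PyInstaller: bundled data lives under the sys._MEIPASS temp dir;
    # running from source: fall back to the current directory.
    base = getattr(sys, '_MEIPASS', os.path.abspath('.'))
    return os.path.join(base, relative)

# e.g. resource_path('config.ini') or resource_path('Img')
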
diff --git a/ADC_function.py b/ADC_function.py
index b13d0b4d3..36be65773 100755
--- a/ADC_function.py
+++ b/ADC_function.py
@@ -1,8 +1,8 @@
from os import replace
import requests
-import hashlib
+#import hashlib
from pathlib import Path
-import random
+import secrets
import os.path
import uuid
import json
@@ -20,12 +20,12 @@ def getXpathSingle(htmlcode, xpath):
return result1
-G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
+G_USER_AGENT = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
# Core web request helper
def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None):
- verify = config.Config().cacert_file()
- configProxy = config.Config().proxy()
+ verify = config.getInstance().cacert_file()
+ configProxy = config.getInstance().proxy()
errors = ""
if ua is None:
@@ -61,7 +61,7 @@ def get_html(url, cookies: dict = None, ua: str = None, return_type: str = None)
def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
- configProxy = config.Config().proxy()
+ configProxy = config.getInstance().proxy()
errors = ""
headers_ua = {"User-Agent": G_USER_AGENT}
if headers is None:
@@ -85,8 +85,12 @@ def post_html(url: str, query: dict, headers: dict = None) -> requests.Response:
def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type: str = None):
- browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
- configProxy = config.Config().proxy()
+ s = None
+ if isinstance(cookies, dict) and len(cookies):
+ s = requests.Session()
+ requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
+ browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
+ configProxy = config.getInstance().proxy()
if configProxy.enable:
browser.session.proxies = configProxy.proxies()
result = browser.open(url)
@@ -103,17 +107,19 @@ def get_html_by_browser(url, cookies: dict = None, ua: str = None, return_type:
return result.text
-def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
- browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua)
- if isinstance(cookies, dict):
- requests.utils.add_dict_to_cookiejar(browser.session.cookies, cookies)
- configProxy = config.Config().proxy()
+def get_html_by_form(url, form_select: str = None, fields: dict = None, cookies: dict = None, ua: str = None, return_type: str = None):
+ s = None
+ if isinstance(cookies, dict) and len(cookies):
+ s = requests.Session()
+ requests.utils.add_dict_to_cookiejar(s.cookies, cookies)
+ browser = mechanicalsoup.StatefulBrowser(user_agent=G_USER_AGENT if ua is None else ua, session=s)
+ configProxy = config.getInstance().proxy()
if configProxy.enable:
browser.session.proxies = configProxy.proxies()
result = browser.open(url)
if not result.ok:
return ''
- form = browser.select_form() if form_name is None else browser.select_form(form_name)
+ form = browser.select_form() if form_select is None else browser.select_form(form_select)
if isinstance(fields, dict):
for k, v in fields.items():
browser[k] = v
@@ -131,7 +137,7 @@ def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: d
# def get_javlib_cookie() -> [dict, str]:
# import cloudscraper
-# switch, proxy, timeout, retry_count, proxytype = config.Config().proxy()
+# switch, proxy, timeout, retry_count, proxytype = config.getInstance().proxy()
# proxies = get_proxy(proxy, proxytype)
#
# raw_cookie = {}
@@ -158,7 +164,7 @@ def get_html_by_form(url, form_name: str = None, fields: dict = None, cookies: d
def translateTag_to_sc(tag):
- tranlate_to_sc = config.Config().transalte_to_sc()
+ tranlate_to_sc = config.getInstance().transalte_to_sc()
if tranlate_to_sc:
dict_gen = {'中文字幕': '中文字幕',
'高清': 'XXXX', '字幕': 'XXXX', '推薦作品': '推荐作品', '通姦': '通奸', '淋浴': '淋浴', '舌頭': '舌头',
@@ -505,8 +511,11 @@ def translate(
delay: int = 0,
):
trans_result = ""
+    # Chinese sentences containing symbols such as & get truncated by Google Translate, losing content, and translating Chinese into Chinese is pointless anyway, so skip those and only translate text that contains Japanese kana
+ if not is_japanese(src):
+ return src
if engine == "google-free":
- gsite = config.Config().get_translate_service_site()
+ gsite = config.getInstance().get_translate_service_site()
if not re.match('^translate\.google\.(com|com\.\w{2}|\w{2})$', gsite):
gsite = 'translate.google.cn'
url = (
@@ -521,7 +530,7 @@ def translate(
trans_result = trans_result.join(translate_list)
# elif engine == "baidu":
# url = "https://fanyi-api.baidu.com/api/trans/vip/translate"
- # salt = random.randint(1, 1435660288)
+    #     salt = secrets.randbelow(1435660288) + 1  # random.randint(1, 1435660288)
# sign = app_id + src + str(salt) + key
# sign = hashlib.md5(sign.encode()).hexdigest()
# url += (
@@ -560,17 +569,6 @@ def translate(
return trans_result
-# ========================================================================是否为无码
-def is_uncensored(number):
- if re.match('^\d{4,}', number) or re.match('n\d{4}', number) or 'HEYZO' in number.upper():
- return True
- configs = config.Config().get_uncensored()
- prefix_list = str(configs).split(',')
- for pre in prefix_list:
- if pre.upper() in number.upper():
- return True
- return False
-
# Cookies exported from a browser carry the site's login credentials, so pages that guests
# cannot reach can be opened as a member
# Example: FC2-755670 url https://javdb9.com/v/vO8Mn
# json file format
@@ -593,20 +591,20 @@ def load_cookies(filename):
filename = os.path.basename(filename)
if not len(filename):
return None, None
- path_search_order = [
- f"./{filename}",
- os.path.join(Path.home(), filename),
- os.path.join(Path.home(), f".avdc/{filename}"),
- os.path.join(Path.home(), f".local/share/avdc/{filename}")
-]
+ path_search_order = (
+ Path.cwd() / filename,
+ Path.home() / filename,
+ Path.home() / f".avdc/{filename}",
+ Path.home() / f".local/share/avdc/{filename}"
+ )
cookies_filename = None
- for p in path_search_order:
- if os.path.exists(p):
- cookies_filename = os.path.abspath(p)
- break
- if not cookies_filename:
- return None, None
try:
+ for p in path_search_order:
+ if p.is_file():
+ cookies_filename = str(p.resolve())
+ break
+ if not cookies_filename:
+ return None, None
return json.load(open(cookies_filename)), cookies_filename
except:
return None, None
@@ -623,10 +621,9 @@ def file_modification_days(filename) -> int:
return 9999
return days
-# 检查文件是否是链接
-def is_link(filename: str):
- if os.path.islink(filename):
- return True # symlink
- elif os.stat(filename).st_nlink > 1:
- return True # hard link Linux MAC OSX Windows NTFS
- return False
+def file_not_exist_or_empty(filepath) -> bool:
+ return not os.path.isfile(filepath) or os.path.getsize(filepath) == 0
+
+# Simple Japanese-text detection
+def is_japanese(s) -> bool:
+ return bool(re.search(r'[\u3040-\u309F\u30A0-\u30FF\uFF66-\uFF9F]', s, re.UNICODE))
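
The recurring change in this file is replacing every ad-hoc config.Config() construction,
which re-reads config.ini on each call, with a shared config.getInstance() accessor. A minimal
sketch of the shape config.py would need to expose (only the getInstance name comes from this
diff; the rest is illustrative):

_instance = None

def getInstance():
    # Parse config.ini once, then hand every caller the same Config object.
    global _instance
    if _instance is None:
        _instance = Config("config.ini")
    return _instance
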
diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py
index c1c7ee4b3..e87be0380 100755
--- a/AV_Data_Capture.py
+++ b/AV_Data_Capture.py
@@ -6,12 +6,13 @@
import shutil
import typing
import urllib3
+import signal
import config
from datetime import datetime, timedelta
import time
from pathlib import Path
-from ADC_function import file_modification_days, get_html, is_link
+from ADC_function import file_modification_days, get_html
from number_parser import get_number
from core import core_main, moveFailedFolder
@@ -35,30 +36,54 @@ def check_update(local_version):
def argparse_function(ver: str) -> typing.Tuple[str, str, bool]:
- parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+ conf = config.getInstance()
+ parser = argparse.ArgumentParser(epilog=f"Load Config file '{conf.ini_path}'.")
parser.add_argument("file", default='', nargs='?', help="Single Movie file path.")
parser.add_argument("-p","--path",default='',nargs='?',help="Analysis folder path.")
- # parser.add_argument("-c", "--config", default='config.ini', nargs='?', help="The config file Path.")
- default_logdir = os.path.join(Path.home(),'.avlogs')
+ parser.add_argument("-m","--main-mode",default='',nargs='?',help="Main mode. 1:Scraping 2:Organizing 3:Scraping in analysis folder")
+ parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number of single movie file.")
+ # parser.add_argument("-C", "--config", default='config.ini', nargs='?', help="The config file Path.")
+ default_logdir = str(Path.home() / '.avlogs')
parser.add_argument("-o","--log-dir",dest='logdir',default=default_logdir,nargs='?',
- help=f"""Duplicate stdout and stderr to logfiles
-in logging folder, default on.
-default for current user: {default_logdir}
-Use --log-dir= to turn off logging feature.""")
- parser.add_argument("-n", "--number", default='', nargs='?', help="Custom file number")
- parser.add_argument("-a", "--auto-exit", dest='autoexit', action="store_true",
- help="Auto exit after program complete")
+ help=f"""Duplicate stdout and stderr to logfiles in logging folder, default on.
+    default folder for current user: '{default_logdir}'. Replace the default folder with an empty regular file of the same name,
+    or use --log-dir= to turn logging off.""")
parser.add_argument("-q","--regex-query",dest='regexstr',default='',nargs='?',help="python re module regex filepath filtering.")
+ parser.add_argument("-d","--nfo-skip-days",dest='days',default='',nargs='?', help="Override nfo_skip_days value in config.")
+ parser.add_argument("-c","--stop-counter",dest='cnt',default='',nargs='?', help="Override stop_counter value in config.")
+ parser.add_argument("-i", "--ignore-failed-list", action="store_true", help="Ignore failed list '{}'".format(
+ os.path.join(os.path.abspath(conf.failed_folder()), 'failed_list.txt')))
+ parser.add_argument("-a", "--auto-exit", action="store_true",
+ help="Auto exit after program complete")
+ parser.add_argument("-g","--debug", action="store_true",
+ help="Turn on debug mode to generate diagnostic log for issue report.")
+ parser.add_argument("-z","--zero-operation",dest='zero_op', action="store_true",
+ help="""Only show job list of files and numbers, and **NO** actual operation
+is performed. It may help you correct wrong numbers before the real job.""")
parser.add_argument("-v", "--version", action="version", version=ver)
+ #ini_path
args = parser.parse_args()
-
- return args.file, args.path, args.number, args.autoexit, args.logdir, args.regexstr
-
+ def get_natural_number_or_none(value):
+ return int(value) if isinstance(value, str) and value.isnumeric() and int(value)>=0 else None
+ def get_str_or_none(value):
+ return value if isinstance(value, str) and len(value) else None
+ def get_bool_or_none(value):
+ return True if isinstance(value, bool) and value else None
+ config.G_conf_override["common:main_mode"] = get_natural_number_or_none(args.main_mode)
+ config.G_conf_override["common:source_folder"] = get_str_or_none(args.path)
+ config.G_conf_override["common:auto_exit"] = get_bool_or_none(args.auto_exit)
+ config.G_conf_override["common:nfo_skip_days"] = get_natural_number_or_none(args.days)
+ config.G_conf_override["common:stop_counter"] = get_natural_number_or_none(args.cnt)
+ config.G_conf_override["common:ignore_failed_list"] = get_bool_or_none(args.ignore_failed_list)
+ config.G_conf_override["debug_mode:switch"] = get_bool_or_none(args.debug)
+
+ return args.file, args.number, args.logdir, args.regexstr, args.zero_op
class OutLogger(object):
def __init__(self, logfile) -> None:
self.term = sys.stdout
self.log = open(logfile,"w",encoding='utf-8',buffering=1)
+ self.filepath = logfile
def __del__(self):
self.close()
def __enter__(self):
@@ -85,6 +110,7 @@ class ErrLogger(OutLogger):
def __init__(self, logfile) -> None:
self.term = sys.stderr
self.log = open(logfile,"w",encoding='utf-8',buffering=1)
+ self.filepath = logfile
def close(self):
if self.term != None:
sys.stderr = self.term
@@ -97,14 +123,18 @@ def close(self):
def dupe_stdout_to_logfile(logdir: str):
if not isinstance(logdir, str) or len(logdir) == 0:
return
- if not os.path.isdir(logdir):
- os.makedirs(logdir)
- if not os.path.isdir(logdir):
- return
-
+ log_dir = Path(logdir)
+ if not log_dir.exists():
+ try:
+ log_dir.mkdir(parents=True,exist_ok=True)
+ except:
+ pass
+ if not log_dir.is_dir():
+        return  # tip: logging can be disabled by replacing this directory with an empty regular file of the same name
+ abslog_dir = log_dir.resolve()
log_tmstr = datetime.now().strftime("%Y%m%dT%H%M%S")
- logfile = os.path.join(logdir, f'avdc_{log_tmstr}.txt')
- errlog = os.path.join(logdir, f'avdc_{log_tmstr}_err.txt')
+ logfile = abslog_dir / f'avdc_{log_tmstr}.txt'
+ errlog = abslog_dir / f'avdc_{log_tmstr}_err.txt'
sys.stdout = OutLogger(logfile)
sys.stderr = ErrLogger(errlog)
@@ -113,28 +143,126 @@ def dupe_stdout_to_logfile(logdir: str):
def close_logfile(logdir: str):
if not isinstance(logdir, str) or len(logdir) == 0 or not os.path.isdir(logdir):
return
+    # save the log file path before the log is closed
+ filepath = None
+ try:
+ filepath = sys.stdout.filepath
+ except:
+ pass
sys.stdout.close()
sys.stderr.close()
+ log_dir = Path(logdir).resolve()
+ if isinstance(filepath, Path):
+ print(f"Log file '{filepath}' saved.")
+ assert(filepath.parent.samefile(log_dir))
    # Clean up empty files
- for current_dir, subdirs, files in os.walk(logdir, topdown=False):
+ for f in log_dir.glob(r'*_err.txt'):
+ if f.stat().st_size == 0:
+ try:
+ f.unlink(missing_ok=True)
+ except:
+ pass
+    # Merge logs. Only text logs directly inside the log directory are checked; subdirectories are ignored.
+    # Logs older than three days are merged into one file per day, logs older than three months into one file
+    # per month, and monthly logs from last year and earlier are merged into yearly logs from April onwards.
+    # Test steps:
+ """
+ LOGDIR=/tmp/avlog
+ mkdir -p $LOGDIR
+ for f in {2016..2020}{01..12}{01..28};do;echo $f>$LOGDIR/avdc_${f}T235959.txt;done
+ for f in {01..09}{01..28};do;echo 2021$f>$LOGDIR/avdc_2021${f}T235959.txt;done
+ for f in {00..23};do;echo 20211001T$f>$LOGDIR/avdc_20211001T${f}5959.txt;done
+ echo "$(ls -1 $LOGDIR|wc -l) files in $LOGDIR"
+ # 1932 files in /tmp/avlog
+ avdc -zgic1 -d0 -m3 -o $LOGDIR
+ # python3 ./AV_Data_Capture.py -zgic1 -o $LOGDIR
+ ls $LOGDIR
+ # rm -rf $LOGDIR
+ """
+ today = datetime.today()
+    # Step 1: merge into daily logs. Logs from more than 3 days ago whose filenames share a date are merged into one file per day.
+ for i in range(1):
+ txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{8}T\d{6}$', f.stem, re.A)]
+ if not txts or not len(txts):
+ break
+ e = [f for f in txts if '_err' in f.stem]
+ txts.sort()
+ tmstr_3_days_ago = (today.replace(hour=0) - timedelta(days=3)).strftime("%Y%m%dT99")
+ deadline_day = f'avdc_{tmstr_3_days_ago}'
+ day_merge = [f for f in txts if f.stem < deadline_day]
+ if not day_merge or not len(day_merge):
+ break
+ cutday = len('T235959.txt') # cut length avdc_20201201|T235959.txt
+ for f in day_merge:
+ try:
+ day_file_name = str(f)[:-cutday] + '.txt' # avdc_20201201.txt
+ with open(day_file_name, 'a', encoding='utf-8') as m:
+ m.write(f.read_text(encoding='utf-8'))
+ f.unlink(missing_ok=True)
+ except:
+ pass
+    # Step 2: merge daily logs into monthly logs
+    for i in range(1): # a single-pass loop whose break jumps past this step, avoiding a deeply indented if block or goto-style flow
+ txts = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{8}$', f.stem, re.A)]
+ if not txts or not len(txts):
+ break
+ txts.sort()
+ tmstr_3_month_ago = (today.replace(day=1) - timedelta(days=3*30)).strftime("%Y%m32")
+ deadline_month = f'avdc_{tmstr_3_month_ago}'
+ month_merge = [f for f in txts if f.stem < deadline_month]
+ if not month_merge or not len(month_merge):
+ break
+ tomonth = len('01.txt') # cut length avdc_202012|01.txt
+ for f in month_merge:
+ try:
+ month_file_name = str(f)[:-tomonth] + '.txt' # avdc_202012.txt
+ with open(month_file_name, 'a', encoding='utf-8') as m:
+ m.write(f.read_text(encoding='utf-8'))
+ f.unlink(missing_ok=True)
+ except:
+ pass
+    # Step 3: merge monthly logs into yearly logs
+ if today.month < 4:
+ return
+ mons = [f for f in log_dir.glob(r'*.txt') if re.match(r'^avdc_\d{6}$', f.stem, re.A)]
+ if not mons or not len(mons):
+ return
+ mons.sort()
+ deadline_year = f'avdc_{today.year-1}13'
+ year_merge = [f for f in mons if f.stem < deadline_year]
+ if not year_merge or not len(year_merge):
+ return
+ toyear = len('12.txt') # cut length avdc_2020|12.txt
+ for f in year_merge:
try:
- for f in files:
- full_name = os.path.join(current_dir, f)
- if os.path.getsize(full_name) == 0:
- os.remove(full_name)
+ year_file_name = str(f)[:-toyear] + '.txt' # avdc_2020.txt
+ with open(year_file_name, 'a', encoding='utf-8') as y:
+ y.write(f.read_text(encoding='utf-8'))
+ f.unlink(missing_ok=True)
except:
pass
+    # Step 4: compressing yearly logs. If you want compression, do it by hand or schedule an external script.
+    # nongnu's lzip is recommended: for text logs of this granularity it currently has the best compression
+    # ratio. With lzip -9 it compresses better than xz -9 while using less memory, utilizing multiple cores
+    # better (plzip, the multithreaded build), and decompressing faster. Compressed output is roughly 2.4% to
+    # 3.7% of the original size; a 100MB log file shrinks to about 3.7MB.
+
+
+def signal_handler(*args):
+ print('[!]Ctrl+C detected, Exit.')
+ sys.exit(9)
+
+def sigdebug_handler(*args):
+ config.G_conf_override["debug_mode:switch"] = not config.G_conf_override["debug_mode:switch"]
+    print('[!]Debug {}'.format('On' if config.getInstance().debug() else 'Off'))
-# 重写视频文件扫描,消除递归,取消全局变量,新增失败文件列表跳过处理
-def movie_lists(root, conf, regexstr):
- escape_folder = re.split("[,,]", conf.escape_folder())
+# Added: skipping of files on the failed list, skipping by .nfo modification age, a count of skipped videos, per-file skip details in debug mode (-g), and skipping of small ad clips
+def movie_lists(source_folder, regexstr):
+ conf = config.getInstance()
main_mode = conf.main_mode()
debug = conf.debug()
nfo_skip_days = conf.nfo_skip_days()
soft_link = conf.soft_link()
- total = []
- file_type = conf.media_type().upper().split(",")
+ file_type = conf.media_type().lower().split(",")
trailerRE = re.compile(r'-trailer\.', re.IGNORECASE)
cliRE = None
if isinstance(regexstr, str) and len(regexstr):
@@ -142,72 +270,94 @@ def movie_lists(root, conf, regexstr):
cliRE = re.compile(regexstr, re.IGNORECASE)
except:
pass
+ failed_list_txt_path = Path(conf.failed_folder()).resolve() / 'failed_list.txt'
failed_set = set()
- if main_mode == 3 or soft_link:
+ if (main_mode == 3 or soft_link) and not conf.ignore_failed_list():
try:
- with open(os.path.join(conf.failed_folder(), 'failed_list.txt'), 'r', encoding='utf-8') as flt:
- flist = flt.read().splitlines()
- failed_set = set(flist)
- flt.close()
- if len(flist) != len(failed_set):
- with open(os.path.join(conf.failed_folder(), 'failed_list.txt'), 'w', encoding='utf-8') as flt:
- flt.writelines([line + '\n' for line in failed_set])
- flt.close()
+ flist = failed_list_txt_path.read_text(encoding='utf-8').splitlines()
+ failed_set = set(flist)
+            if len(flist) != len(failed_set): # dedupe and write back without reordering failed_list.txt entries; for duplicates keep only the last occurrence
+ fset = failed_set.copy()
+ for i in range(len(flist)-1, -1, -1):
+ fset.remove(flist[i]) if flist[i] in fset else flist.pop(i)
+ failed_list_txt_path.write_text('\n'.join(flist) + '\n', encoding='utf-8')
+ assert len(fset) == 0 and len(flist) == len(failed_set)
except:
pass
- for current_dir, subdirs, files in os.walk(root, topdown=False):
- if len(set(current_dir.replace("\\","/").split("/")) & set(escape_folder)) > 0:
+ if not Path(source_folder).is_dir():
+ print('[-]Source folder not found!')
+ return []
+ total = []
+ source = Path(source_folder).resolve()
+ skip_failed_cnt, skip_nfo_days_cnt = 0, 0
+ escape_folder_set = set(re.split("[,,]", conf.escape_folder()))
+ for full_name in source.glob(r'**/*'):
+ if main_mode != 3 and set(full_name.parent.parts) & escape_folder_set:
+ continue
+ if not full_name.suffix.lower() in file_type:
+ continue
+ absf = str(full_name)
+ if absf in failed_set:
+ skip_failed_cnt += 1
+ if debug:
+ print('[!]Skip failed movie:', absf)
+ continue
+ is_sym = full_name.is_symlink()
+        if main_mode != 3 and (is_sym or full_name.stat().st_nlink > 1): # short-circuit boolean: never stat() a symlink, it may point at a missing target
+ continue # file is symlink or hardlink(Linux/NTFS/Darwin)
+        # let 0-byte debug samples through; drop ads smaller than 120MB such as '苍老师强力推荐.mp4'(102.2MB), '黑道总裁.mp4'(98.4MB), '有趣的妹子激情表演.MP4'(95MB), '有趣的臺灣妹妹直播.mp4'(15.1MB)
+        movie_size = 0 if is_sym else full_name.stat().st_size # as above: no stat()/st_size for symlinks, assign 0 to bypass the small-video check
+ if movie_size > 0 and movie_size < 125829120: # 1024*1024*120=125829120
+ continue
+ if cliRE and not cliRE.search(absf) or trailerRE.search(full_name.name):
+ continue
+ if main_mode == 3 and nfo_skip_days > 0 and file_modification_days(full_name.with_suffix('.nfo')) <= nfo_skip_days:
+ skip_nfo_days_cnt += 1
+ if debug:
+ print(f"[!]Skip movie by it's .nfo which modified within {nfo_skip_days} days: '{absf}'")
continue
- for f in files:
- full_name = os.path.join(current_dir, f)
- if not os.path.splitext(full_name)[1].upper() in file_type:
- continue
- absf = os.path.abspath(full_name)
- if absf in failed_set:
- if debug:
- print('[!]Skip failed file:', absf)
- continue
- if cliRE and not cliRE.search(absf):
- continue
- if main_mode == 3 and nfo_skip_days > 0:
- nfo = Path(absf).with_suffix('.nfo')
- if file_modification_days(nfo) <= nfo_skip_days:
- continue
- if (main_mode == 3 or not is_link(absf)) and not trailerRE.search(f):
- total.append(absf)
+ total.append(absf)
+
+ if skip_failed_cnt:
+ print(f"[!]Skip {skip_failed_cnt} movies in failed list '{failed_list_txt_path}'.")
+ if skip_nfo_days_cnt:
+ print(f"[!]Skip {skip_nfo_days_cnt} movies in source folder '{source}' who's .nfo modified within {nfo_skip_days} days.")
if nfo_skip_days <= 0 or not soft_link or main_mode == 3:
return total
    # In soft-link mode, titles already scraped successfully also need their .nfo ages checked in the success folder, skipping those updated within N days
skip_numbers = set()
- success_folder = conf.success_folder()
- for current_dir, subdirs, files in os.walk(success_folder, topdown=False):
- for f in files:
- f_obj = Path(f)
- if f_obj.suffix.lower() != '.nfo':
- continue
- if file_modification_days(Path(current_dir) / f_obj) > nfo_skip_days:
- continue
- number = get_number(False, f_obj.stem)
- if number:
- skip_numbers.add(number.upper())
+ success_folder = Path(conf.success_folder()).resolve()
+ for f in success_folder.glob(r'**/*'):
+ if not re.match(r'\.nfo', f.suffix, re.IGNORECASE):
+ continue
+ if file_modification_days(f) > nfo_skip_days:
+ continue
+ number = get_number(False, f.stem)
+ if not number:
+ continue
+ skip_numbers.add(number.lower())
+
rm_list = []
for f in total:
n_number = get_number(False, os.path.basename(f))
- if n_number and n_number.upper() in skip_numbers:
+ if n_number and n_number.lower() in skip_numbers:
rm_list.append(f)
for f in rm_list:
total.remove(f)
+ if debug:
+ print(f"[!]Skip file successfully processed within {nfo_skip_days} days: '{f}'")
+ if len(rm_list):
+ print(f"[!]Skip {len(rm_list)} movies in success folder '{success_folder}' who's .nfo modified within {nfo_skip_days} days.")
+
return total
def create_failed_folder(failed_folder):
- if not os.path.isdir(failed_folder): # 新建failed文件夹
+    if not os.path.exists(failed_folder): # create the failed folder
try:
os.makedirs(failed_folder)
- if not os.path.isdir(failed_folder):
- raise
except:
- print("[-]failed!can not be make folder 'failed'\n[-](Please run as Administrator)")
+ print(f"[-]Fatal error! Can not make folder '{failed_folder}'")
sys.exit(0)
@@ -227,24 +377,29 @@ def rm_empty_folder(path):
pass
-def create_data_and_move(file_path: str, c: config.Config, debug):
+def create_data_and_move(file_path: str, zero_op):
# Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4
- file_name = os.path.basename(file_path)
- n_number = get_number(debug, file_name)
+ debug = config.getInstance().debug()
+ n_number = get_number(debug, os.path.basename(file_path))
file_path = os.path.abspath(file_path)
if debug == True:
- print(f"[!]Making Data for [{file_path}], the number is [{n_number}]")
+ print(f"[!] [{n_number}] As Number making data for '{file_path}'")
+ if zero_op:
+ return
if n_number:
- core_main(file_path, n_number, c)
+ core_main(file_path, n_number)
else:
print("[-] number empty ERROR")
+ moveFailedFolder(file_path)
print("[*]======================================================")
else:
try:
- print(f"[!]Making Data for [{file_path}], the number is [{n_number}]")
+ print(f"[!] [{n_number}] As Number making data for '{file_path}'")
+ if zero_op:
+ return
if n_number:
- core_main(file_path, n_number, c)
+ core_main(file_path, n_number)
else:
raise ValueError("number empty")
print("[*]======================================================")
@@ -253,22 +408,26 @@ def create_data_and_move(file_path: str, c: config.Config, debug):
print('[-]', err)
try:
- moveFailedFolder(file_path, conf)
+ moveFailedFolder(file_path)
except Exception as err:
print('[!]', err)
-def create_data_and_move_with_custom_number(file_path: str, c: config.Config, custom_number):
+def create_data_and_move_with_custom_number(file_path: str, custom_number):
+ conf = config.getInstance()
file_name = os.path.basename(file_path)
try:
- print("[!]Making Data for [{}], the number is [{}]".format(file_path, custom_number))
- core_main(file_path, custom_number, c)
+ print("[!] [{1}] As Number making data for '{0}'".format(file_path, custom_number))
+ if custom_number:
+ core_main(file_path, custom_number)
+ else:
+ print("[-] number empty ERROR")
print("[*]======================================================")
except Exception as err:
print("[-] [{}] ERROR:".format(file_path))
print('[-]', err)
- if c.soft_link():
+ if conf.soft_link():
print("[-]Link {} to failed folder".format(file_path))
os.symlink(file_path, os.path.join(conf.failed_folder(), file_name))
else:
@@ -279,12 +438,26 @@ def create_data_and_move_with_custom_number(file_path: str, c: config.Config, cu
print('[!]', err)
-if __name__ == '__main__':
+def main():
version = '5.0.1'
urllib3.disable_warnings() #Ignore http proxy warning
+
+    # Read config.ini first; argparse_function() needs conf.failed_folder()
+ conf = config.Config("config.ini")
+
# Parse command line args
- single_file_path, folder_path, custom_number, auto_exit, logdir, regexstr = argparse_function(version)
+ single_file_path, custom_number, logdir, regexstr, zero_op = argparse_function(version)
+ main_mode = conf.main_mode()
+ if not main_mode in (1, 2, 3):
+ print(f"[-]Main mode must be 1 or 2 or 3! You can run '{os.path.basename(sys.argv[0])} --help' for more help.")
+ sys.exit(4)
+
+ signal.signal(signal.SIGINT, signal_handler)
+ if sys.platform == 'win32':
+ signal.signal(signal.SIGBREAK, sigdebug_handler)
+ else:
+ signal.signal(signal.SIGWINCH, sigdebug_handler)
dupe_stdout_to_logfile(logdir)
print('[*]================== AV Data Capture ===================')
@@ -293,55 +466,62 @@ def create_data_and_move_with_custom_number(file_path: str, c: config.Config, cu
print('[*]======================================================')
print('[*]严禁在墙内宣传本项目')
- # Read config.ini
- conf = config.Config("config.ini")
-
+ start_time = time.time()
+ print('[+]Start at', time.strftime("%Y-%m-%d %H:%M:%S"))
if conf.update_check():
check_update(version)
+ print(f"[+]Load Config file '{conf.ini_path}'.")
if conf.debug():
print('[+]Enable debug')
if conf.soft_link():
print('[!]Enable soft link')
- #print('[!]CmdLine:'," ".join(sys.argv[1:]))
+ if len(sys.argv)>1:
+ print('[!]CmdLine:'," ".join(sys.argv[1:]))
+ print('[+]Main Working mode ## {}: {} ## {}{}{}'
+ .format(*(main_mode, ['Scraping', 'Organizing', 'Scraping in analysis folder'][main_mode-1],
+ "" if not conf.multi_threading() else ", multi_threading on",
+ "" if conf.nfo_skip_days() == 0 else f", nfo_skip_days={conf.nfo_skip_days()}",
+ "" if conf.stop_counter() == 0 else f", stop_counter={conf.stop_counter()}"
+ ) if not single_file_path else ('-','Single File', '','',''))
+ )
create_failed_folder(conf.failed_folder())
- start_time = time.time()
if not single_file_path == '': #Single File
print('[+]==================== Single File =====================')
if custom_number == '':
- create_data_and_move_with_custom_number(single_file_path, conf, get_number(conf.debug(), os.path.basename(single_file_path)))
+ create_data_and_move_with_custom_number(single_file_path, get_number(conf.debug(), os.path.basename(single_file_path)))
else:
- create_data_and_move_with_custom_number(single_file_path, conf, custom_number)
+ create_data_and_move_with_custom_number(single_file_path, custom_number)
else:
- if folder_path == '':
+ folder_path = conf.source_folder()
+ if not isinstance(folder_path, str) or folder_path == '':
folder_path = os.path.abspath(".")
- movie_list = movie_lists(folder_path, conf, regexstr)
+ movie_list = movie_lists(folder_path, regexstr)
count = 0
count_all = str(len(movie_list))
- print('[+]Find', count_all, 'movies. Start at', time.strftime("%Y-%m-%d %H:%M:%S"))
- main_mode = conf.main_mode()
+    print('[+]Found', count_all, 'movies.')
+ print('[*]======================================================')
stop_count = conf.stop_counter()
if stop_count<1:
stop_count = 999999
else:
count_all = str(min(len(movie_list), stop_count))
- if main_mode == 3:
- print(f'[!]运行模式:**维护模式**,本程序将在处理{count_all}个视频文件后停止,如需后台执行自动退出请结合 -a 参数。')
+
    for movie_path in movie_list: # iterate the movie list, handing each file to core
count = count + 1
percentage = str(count / int(count_all) * 100)[:4] + '%'
- print('[!] - ' + percentage + ' [' + str(count) + '/' + count_all + '] -')
- create_data_and_move(movie_path, conf, conf.debug())
+ print('[!] {:>30}{:>21}'.format('- ' + percentage + ' [' + str(count) + '/' + count_all + '] -', time.strftime("%H:%M:%S")))
+ create_data_and_move(movie_path, zero_op)
if count >= stop_count:
print("[!]Stop counter triggered!")
break
- if conf.del_empty_folder():
+ if conf.del_empty_folder() and not zero_op:
rm_empty_folder(conf.success_folder())
rm_empty_folder(conf.failed_folder())
if len(folder_path):
@@ -353,9 +533,15 @@ def create_data_and_move_with_custom_number(file_path: str, c: config.Config, cu
" End at", time.strftime("%Y-%m-%d %H:%M:%S"))
print("[+]All finished!!!")
- if not (conf.auto_exit() or auto_exit):
- input("Press enter key exit, you can check the error message before you exit...")
close_logfile(logdir)
+ if not conf.auto_exit():
+ input("Press enter key exit, you can check the error message before you exit...")
+
sys.exit(0)
+
+import multiprocessing
+if __name__ == '__main__':
+ multiprocessing.freeze_support()
+ main()
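
The new CLI flags above do not pass values back through main(); they publish overrides into
config.G_conf_override keyed as "section:option" (None means "not given"). Config getters are
then expected to consult the override before config.ini. A hedged sketch of that lookup inside
config.py (the getter body is an assumption; only G_conf_override and the key format come from
this diff):

def main_mode(self):
    override = G_conf_override.get("common:main_mode")
    if override is not None:  # a CLI flag wins over the value parsed from config.ini
        return override
    return self.conf.getint("common", "main_mode")
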
diff --git a/Makefile b/Makefile
index 407aa4b00..4c8960aa8 100644
--- a/Makefile
+++ b/Makefile
@@ -16,7 +16,9 @@ make:
#export cloudscraper_path=$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1)
@echo "[+]Pyinstaller make"
- pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "Img:Img"
+ pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
+ --add-data "Img:Img" \
+ --add-data "config.ini:." \
@echo "[+]Move to bin"
if [ ! -d "./bin" ];then mkdir bin; fi
diff --git a/WebCrawler/__init__.py b/WebCrawler/__init__.py
index e1608b661..039fed0c5 100644
--- a/WebCrawler/__init__.py
+++ b/WebCrawler/__init__.py
@@ -32,7 +32,7 @@ def get_data_state(data: dict) -> bool: # 元数据获取失败检测
return True
-def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数据
+def get_data_from_json(file_number):  # return metadata from JSON
"""
iterate through all services and fetch the data
"""
@@ -53,6 +53,7 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
"fc2club": fc2club.main
}
+ conf = config.getInstance()
# default fetch order list, from the beginning to the end
sources = conf.sources().split(',')
if not len(conf.sources()) > 80:
@@ -114,6 +115,7 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
json_data = json.loads(pool.apply_async(func_mapping[source], (file_number,)).get())
# if any service return a valid return, break
if get_data_state(json_data):
+ print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
break
pool.close()
pool.terminate()
@@ -125,6 +127,7 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
json_data = json.loads(func_mapping[source](file_number))
# if any service return a valid return, break
if get_data_state(json_data):
+ print(f"[+]Find movie [{file_number}] metadata on website '{source}'")
break
except:
break
@@ -134,6 +137,14 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
print('[-]Movie Number not found!')
return None
+    # Add a strict number check to guard against faulty data sources that answer any submitted
+    # number with the same record ("本橋実来 ADZ335"), i.e. the returned number does not match.
+    # The current number naming rules follow javdb.com (Domain Creation Date: 2013-06-19T18:34:27Z);
+    # other rule sets such as airav.wiki (Domain Creation Date: 2019-08-28T07:18:42.0Z) are worth tracking too.
+    # If different studios ever collide on the same name under javdb.com's rules, consider switching
+    # rules and updating the number parsing and scraping code accordingly.
+ if str(json_data.get('number')).upper() != file_number.upper():
+ print('[-]Movie number has changed! [{}]->[{}]'.format(file_number, str(json_data.get('number'))))
+ return None
+
# ================================================网站规则添加结束================================================
title = json_data.get('title')
@@ -167,6 +178,10 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
imagecut = json_data.get('imagecut')
    tag = str(json_data.get('tag')).strip("[ ]").replace("'", '').replace(" ", '').split(',') # string to list @
+ while 'XXXX' in tag:
+ tag.remove('XXXX')
+ while 'xxx' in tag:
+ tag.remove('xxx')
actor = str(actor_list).strip("[ ]").replace("'", '').replace(" ", '')
if title == '' or number == '':
@@ -225,6 +240,8 @@ def get_data_from_json(file_number, conf: config.Config): # 从JSON返回元数
studio = studio.replace('エムズビデオグループ','M’s Video Group')
studio = studio.replace('ミニマム','Minimum')
studio = studio.replace('ワープエンタテインメント','WAAP Entertainment')
+ studio = studio.replace('pacopacomama,パコパコママ','pacopacomama')
+ studio = studio.replace('パコパコママ','pacopacomama')
studio = re.sub('.*/妄想族','妄想族',studio)
studio = studio.replace('/',' ')
    # === Replace studio katakana END
@@ -293,4 +310,7 @@ def special_characters_replacement(text) -> str:
replace('"', '"'). # U+FF02 FULLWIDTH QUOTATION MARK @ Basic Multilingual Plane
replace('<', 'ᐸ'). # U+1438 CANADIAN SYLLABICS PA @ Basic Multilingual Plane
replace('>', 'ᐳ'). # U+1433 CANADIAN SYLLABICS PO @ Basic Multilingual Plane
- replace('|', 'ǀ')) # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
+ replace('|', 'ǀ'). # U+01C0 LATIN LETTER DENTAL CLICK @ Basic Multilingual Plane
+        replace('&lsquo;', '‘'). # U+2018 LEFT SINGLE QUOTATION MARK
+        replace('&rsquo;', '’'). # U+2019 RIGHT SINGLE QUOTATION MARK
+        replace('&amp;', '&'))
diff --git a/WebCrawler/airav.py b/WebCrawler/airav.py
index 59254217f..f7b144ce8 100644
--- a/WebCrawler/airav.py
+++ b/WebCrawler/airav.py
@@ -6,6 +6,7 @@
from bs4 import BeautifulSoup#need install
import json
from ADC_function import *
+from WebCrawler import javbus
'''
API
@@ -17,95 +18,94 @@
host = 'https://www.airav.wiki'
# airav has no actress photos, so use javbus's images directly
-def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
- soup = BeautifulSoup(htmlcode, 'lxml')
- a = soup.find_all(attrs={'class': 'star-name'})
- d={}
- for i in a:
- l=i.a['href']
- t=i.get_text()
- html = etree.fromstring(get_html(l), etree.HTMLParser())
- p=urljoin("https://www.javbus.com",
- str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
- p2={t:p}
- d.update(p2)
- return d
+def getActorPhoto(javbus_json):
+ result = javbus_json.get('actor_photo')
+ if isinstance(result, dict) and len(result):
+ return result
+ return ''
def getTitle(htmlcode): # get title
- doc = pq(htmlcode)
- # h5:first-child定位第一个h5标签,妈的找了好久才找到这个语法
- title = str(doc('div.d-flex.videoDataBlock h5.d-none.d-md-block:nth-child(2)').text()).replace(' ', '-')
- try:
- title2 = re.sub('n\d+-','',title)
-
- return title2
- except:
- return title
-
-def getStudio(htmlcode): #获取厂商 已修改
- html = etree.fromstring(htmlcode,etree.HTMLParser())
- # 如果记录中冇导演,厂商排在第4位
- if '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
- result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
- # 如果记录中有导演,厂商排在第5位
- elif '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"):
- result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
- else:
- result = ''
+ html = etree.fromstring(htmlcode, etree.HTMLParser())
+ title = str(html.xpath('/html/head/title/text()')[0])
+ result = str(re.findall('](.*?)- AIRAV-WIKI', title)[0]).strip()
return result
-def getYear(htmlcode): #获取年份
+
+def getStudio(htmlcode, javbus_json): # get studio (revised)
+    # prefer javbus data when it exists
+ result = javbus_json.get('studio')
+ if isinstance(result, str) and len(result):
+ return result
html = etree.fromstring(htmlcode,etree.HTMLParser())
- result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
- return result
-def getCover(htmlcode): #获取封面链接
- doc = pq(htmlcode)
- image = doc('a.bigImage')
- return urljoin("https://www.javbus.com", image.attr('href'))
-def getRelease(htmlcode): #获取出版日期
+ return str(html.xpath('//a[contains(@href,"?video_factory=")]/text()')).strip(" ['']")
+def getYear(htmlcode, javbus_json): # get year
+ result = javbus_json.get('year')
+ if isinstance(result, str) and len(result):
+ return result
+ release = getRelease(htmlcode, javbus_json)
+ if len(release) != len('2000-01-01'):
+ return ''
+ return release[:4]
+def getCover(htmlcode, javbus_json): # get cover image
+ result = javbus_json.get('cover')
+ if isinstance(result, str) and len(result):
+ return result
html = etree.fromstring(htmlcode, etree.HTMLParser())
- result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
- return result
-def getRuntime(htmlcode): #获取分钟 已修改
+ return html.xpath('//img[contains(@src,"/storage/big_pic/")]/@src')[0]
+def getRelease(htmlcode, javbus_json): # get release date
+ result = javbus_json.get('release')
+ if isinstance(result, str) and len(result):
+ return result
html = etree.fromstring(htmlcode, etree.HTMLParser())
- result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘")
+ try:
+ result = re.search(r'\d{4}-\d{2}-\d{2}', str(html.xpath('//li[contains(text(),"發片日期")]/text()'))).group()
+ except:
+ return ''
return result
-def getActor(htmlcode): #获取女优
+def getRuntime(javbus_json): # get runtime
+ result = javbus_json.get('runtime')
+ if isinstance(result, str) and len(result):
+ return result
+ return ''
+# airav's actress database mostly uses kanji names while javbus mostly uses kana, so airav takes priority
+def getActor(htmlcode, javbus_json): # get actresses
b=[]
- soup=BeautifulSoup(htmlcode,'lxml')
- a=soup.find_all(attrs={'class':'star-name'})
- for i in a:
- b.append(i.get_text())
- return b
-def getNum(htmlcode): #获取番号
html = etree.fromstring(htmlcode, etree.HTMLParser())
- result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
- return result
-def getDirector(htmlcode): #获取导演 已修改
+ a = html.xpath('//ul[@class="videoAvstarList"]/li/a[starts-with(@href,"/idol/")]/text()')
+ for v in a:
+ v = v.strip()
+ if len(v):
+ b.append(v)
+ if len(b):
+ return b
+ result = javbus_json.get('actor')
+ if isinstance(result, list) and len(result):
+ return result
+ return []
+def getNum(htmlcode, javbus_json): # get number
+ result = javbus_json.get('number')
+ if isinstance(result, str) and len(result):
+ return result
html = etree.fromstring(htmlcode, etree.HTMLParser())
- if '導演:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
- result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
- else:
- result = '' # 记录中有可能没有导演数据
+ title = str(html.xpath('/html/head/title/text()')[0])
+ result = str(re.findall('^\[(.*?)]', title)[0])
return result
-
-def getOutline(htmlcode): #获取演员
+def getDirector(javbus_json): # get director (revised)
+ result = javbus_json.get('director')
+ if isinstance(result, str) and len(result):
+ return result
+ return ''
+def getOutline(htmlcode): # get synopsis
html = etree.fromstring(htmlcode, etree.HTMLParser())
try:
- result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','')
+ result = html.xpath("string(//div[@class='d-flex videoDataBlock']/div[@class='synopsis']/p)").replace('\n','').strip()
return result
except:
return ''
-def getSerise(htmlcode): #获取系列 已修改
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- # 如果记录中冇导演,系列排在第6位
- if '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"):
- result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']")
- # 如果记录中有导演,系列排在第7位
- elif '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"):
- result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
- else:
- result = ''
- return result
+def getSerise(javbus_json): # get series (revised)
+ result = javbus_json.get('series')
+ if isinstance(result, str) and len(result):
+ return result
+ return ''
def getTag(htmlcode): # get tags
tag = []
soup = BeautifulSoup(htmlcode, 'lxml')
@@ -169,52 +169,50 @@ def main(number):
try:
try:
htmlcode = get_html('https://cn.airav.wiki/video/' + number)
- javbus_htmlcode = get_html('https://www.javbus.com/ja/' + number)
-
+ javbus_json = json.loads(javbus.main(number))
except:
print(number)
dic = {
        # title can use airav
- 'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
- # 制作商选择使用javbus
- 'studio': getStudio(javbus_htmlcode),
- # 年份也是用javbus
- 'year': str(re.search('\d{4}', getYear(javbus_htmlcode)).group()),
+ 'title': getTitle(htmlcode),
+        # studio: try javbus first, fall back to this site
+        'studio': getStudio(htmlcode, javbus_json),
+        # year: try javbus first, fall back to this site
+ 'year': getYear(htmlcode, javbus_json),
        # synopsis: use airav
        'outline': getOutline(htmlcode),
        # use javbus
-        'runtime': getRuntime(javbus_htmlcode),
+        'runtime': getRuntime(javbus_json),
        # director: use javbus
- 'director': getDirector(javbus_htmlcode),
- # 作者 使用airav
- 'actor': getActor(javbus_htmlcode),
- # 发售日使用javbus
- 'release': getRelease(javbus_htmlcode),
+ 'director': getDirector(javbus_json),
+        # actors: try airav first
+        'actor': getActor(htmlcode, javbus_json),
+        # release date: try javbus first
+        'release': getRelease(htmlcode, javbus_json),
        # number: use javbus
-        'number': getNum(javbus_htmlcode),
+        'number': getNum(htmlcode, javbus_json),
        # cover URL: use javbus
-        'cover': getCover(javbus_htmlcode),
+        'cover': getCover(htmlcode, javbus_json),
        # fetch stills (extrafanart)
'extrafanart': getExtrafanart(htmlcode),
'imagecut': 1,
        # use airav
        'tag': getTag(htmlcode),
        # use javbus
-        'label': getSerise(javbus_htmlcode),
+        'label': getSerise(javbus_json),
        # annoyingly, airav provides no actor photos
- 'actor_photo': getActorPhoto(javbus_htmlcode),
-
+# 'actor_photo': getActorPhoto(javbus_json),
'website': 'https://www.airav.wiki/video/' + number,
'source': 'airav.py',
# 使用javbus
- 'series': getSerise(javbus_htmlcode),
+ 'series': getSerise(javbus_json)
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
return js
except Exception as e:
- if config.Config().debug():
+ if config.getInstance().debug():
print(e)
data = {
"title": "",
@@ -226,6 +224,6 @@ def main(number):
if __name__ == '__main__':
- #print(main('ADN-188'))
- print(main('ADN-188'))
- print(main('CJOD-278'))
+    print(main('ADV-R0624')) # javbus page returns 404, airav has data
+    print(main('ADN-188')) # one actress
+    print(main('CJOD-278')) # multiple actresses; javbus uses kana for actor names, airav uses kanji
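
All of the rewritten airav getters above follow one convention: take both the airav page and
the parsed javbus JSON, and prefer whichever source is more reliable for that field. Distilled
into a sketch (get_field is hypothetical; the real code inlines this logic per getter):

def get_field(htmlcode, javbus_json, key, parse_airav):
    result = javbus_json.get(key)           # javbus first for most fields...
    if isinstance(result, str) and len(result):
        return result
    return parse_airav(htmlcode)            # ...airav page as fallback (reversed for actors)
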
diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py
index 254f3e887..e38a452d5 100644
--- a/WebCrawler/avsox.py
+++ b/WebCrawler/avsox.py
@@ -3,50 +3,42 @@
import re
from lxml import etree
import json
-from bs4 import BeautifulSoup
from ADC_function import *
-# import sys
+from WebCrawler.storyline import getStoryline
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
-def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
- soup = BeautifulSoup(htmlcode, 'lxml')
- a = soup.find_all(attrs={'class': 'avatar-box'})
+def getActorPhoto(html):
+ a = html.xpath('//a[@class="avatar-box"]')
d = {}
for i in a:
- l = i.img['src']
- t = i.span.get_text()
+ l = i.find('.//img').attrib['src']
+ t = i.find('span').text
p2 = {t: l}
d.update(p2)
return d
-def getTitle(a):
+def getTitle(html):
try:
- html = etree.fromstring(a, etree.HTMLParser())
result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']") #[0]
return result.replace('/', '')
except:
return ''
-def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
- soup = BeautifulSoup(a, 'lxml')
- a = soup.find_all(attrs={'class': 'avatar-box'})
+def getActor(html):
+ a = html.xpath('//a[@class="avatar-box"]')
d = []
for i in a:
- d.append(i.span.get_text())
+ d.append(i.find('span').text)
return d
-def getStudio(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getStudio(html):
result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ')
return result1
-def getRuntime(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getRuntime(html):
result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']")
return result1
-def getLabel(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getLabel(html):
result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']")
return result1
-def getNum(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getNum(html):
result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']")
return result1
def getYear(release):
@@ -55,28 +47,20 @@ def getYear(release):
return result
except:
return release
-def getRelease(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getRelease(html):
result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']")
return result1
-def getCover(htmlcode):
- html = etree.fromstring(htmlcode, etree.HTMLParser())
+def getCover(html):
result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
return result
-def getCover_small(htmlcode):
- html = etree.fromstring(htmlcode, etree.HTMLParser())
+def getCover_small(html):
result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
return result
-def getTag(a): # 获取演员
- soup = BeautifulSoup(a, 'lxml')
- a = soup.find_all(attrs={'class': 'genre'})
- d = []
- for i in a:
- d.append(i.get_text())
- return d
-def getSeries(htmlcode):
+def getTag(html):
+ x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
+ return x[2:] if len(x) > 2 else []
+def getSeries(html):
try:
- html = etree.fromstring(htmlcode, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
return result1
except:
@@ -86,42 +70,45 @@ def main(number):
html = get_html('https://tellme.pw/avsox')
site = etree.HTML(html).xpath('//div[@class="container"]/div/a/@href')[0]
a = get_html(site + '/cn/search/' + number)
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+ html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html(site + '/cn/search/' + number.replace('-', '_'))
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+ html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html(site + '/cn/search/' + number.replace('_', ''))
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+ html = etree.fromstring(a, etree.HTMLParser())
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
- web = get_html("https:" + result1)
- soup = BeautifulSoup(web, 'lxml')
- info = str(soup.find(attrs={'class': 'row movie'}))
+ detail = get_html("https:" + result1)
+ lx = etree.fromstring(detail, etree.HTMLParser())
try:
+ new_number = getNum(lx)
+ if new_number.upper() != number.upper():
+ raise ValueError('number not found')
+        title = getTitle(lx).replace(new_number, '').strip()  # drop the number from the title; strip(new_number) would strip a character set, not the substring
dic = {
- 'actor': getActor(web),
- 'title': getTitle(web).strip(getNum(web)),
- 'studio': getStudio(info),
- 'outline': '', #
- 'runtime': getRuntime(info),
+ 'actor': getActor(lx),
+ 'title': title,
+ 'studio': getStudio(lx),
+ 'outline': getStoryline(number, title),
+ 'runtime': getRuntime(lx),
'director': '', #
- 'release': getRelease(info),
- 'number': getNum(info),
- 'cover': getCover(web),
- 'cover_small': getCover_small(a),
+ 'release': getRelease(lx),
+ 'number': new_number,
+ 'cover': getCover(lx),
+ 'cover_small': getCover_small(html),
'imagecut': 3,
- 'tag': getTag(web),
- 'label': getLabel(info),
- 'year': getYear(getRelease(info)), # str(re.search('\d{4}',getRelease(a)).group()),
- 'actor_photo': getActorPhoto(web),
+ 'tag': getTag(lx),
+ 'label': getLabel(lx),
+ 'year': getYear(getRelease(lx)),
+ 'actor_photo': getActorPhoto(lx),
'website': "https:" + result1,
'source': 'avsox.py',
- 'series': getSeries(info),
+ 'series': getSeries(lx),
}
except Exception as e:
- if config.Config().debug():
+ if config.getInstance().debug():
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
@@ -129,3 +116,4 @@ def main(number):
if __name__ == "__main__":
print(main('012717_472'))
+    print(main('1')) # a fake result raises 'number not found'
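
The avsox rewrite drops BeautifulSoup and the per-getter etree.fromstring() calls: the detail
page is parsed once and the shared lxml element is passed to every getter, so each function
runs only its own XPath. The pattern in brief (a recap of the code above, not new API):

from lxml import etree

detail = get_html("https:" + result1)               # fetch the detail page once
lx = etree.fromstring(detail, etree.HTMLParser())   # parse once
# every getter now takes the parsed tree:
dic = {'actor': getActor(lx), 'studio': getStudio(lx), 'release': getRelease(lx)}
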
diff --git a/WebCrawler/carib.py b/WebCrawler/carib.py
index 8eee1aff6..790b91039 100755
--- a/WebCrawler/carib.py
+++ b/WebCrawler/carib.py
@@ -1,51 +1,56 @@
import sys
sys.path.append('../')
import json
-from bs4 import BeautifulSoup
from lxml import html
import re
from ADC_function import *
+from WebCrawler.storyline import getStoryline
def main(number: str) -> json:
try:
- caribbytes, browser = get_html_by_browser(
- 'https://www.caribbeancom.com/moviepages/'+number+'/index.html',
- return_type="browser")
-
- if not caribbytes or not caribbytes.ok:
+        # actor-photo support is not used yet; temporarily commented out for speed, switched to get_html()
+ #r, browser = get_html_by_browser('https://www.caribbeancom.com/moviepages/'+number+'/index.html',
+ # return_type='browser')
+ #if not r.ok:
+ # raise ValueError("page not found")
+ #htmlcode = str(browser.page)
+ htmlbyte = get_html('https://www.caribbeancom.com/moviepages/'+number+'/index.html', return_type='content')
+ htmlcode = htmlbyte.decode('euc-jp')
+        if not htmlcode or '404' in htmlcode or 'class="movie-info section"' not in htmlcode:
raise ValueError("page not found")
- lx = html.fromstring(str(browser.page))
+ lx = html.fromstring(htmlcode)
+ title = get_title(lx)
+
+ dic = {
+ 'title': title,
+ 'studio': '加勒比',
+ 'year': get_year(lx),
+ 'outline': get_outline(lx, number, title),
+ 'runtime': get_runtime(lx),
+ 'director': '',
+ 'actor': get_actor(lx),
+ 'release': get_release(lx),
+ 'number': number,
+ 'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
+ 'tag': get_tag(lx),
+ 'extrafanart': get_extrafanart(lx),
+ 'label': get_series(lx),
+ 'imagecut': 1,
+# 'actor_photo': get_actor_photo(browser),
+ 'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
+ 'source': 'carib.py',
+ 'series': get_series(lx),
+ }
+ js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
+ return js
- if not browser.page.select_one("#moviepages > div > div:nth-child(1) > div.movie-info.section"):
- raise ValueError("page info not found")
except Exception as e:
- if config.Config().debug():
+ if config.getInstance().debug():
print(e)
dic = {"title": ""}
return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))
- dic = {
- 'title': get_title(lx),
- 'studio': '加勒比',
- 'year': get_year(lx),
- 'outline': get_outline(lx),
- 'runtime': get_runtime(lx),
- 'director': '',
- 'actor': get_actor(lx),
- 'release': get_release(lx),
- 'number': number,
- 'cover': 'https://www.caribbeancom.com/moviepages/' + number + '/images/l_l.jpg',
- 'tag': get_tag(lx),
- 'extrafanart': get_extrafanart(lx),
- 'label': get_series(lx),
- 'imagecut': 1,
-# 'actor_photo': get_actor_photo(browser),
- 'website': 'https://www.caribbeancom.com/moviepages/' + number + '/index.html',
- 'source': 'carib.py',
- 'series': get_series(lx),
- }
- js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), )
- return js
+
def get_title(lx: html.HtmlElement) -> str:
return str(lx.xpath("//div[@class='movie-info section']/div[@class='heading']/h1[@itemprop='name']/text()")[0]).strip()
@@ -53,8 +58,12 @@ def get_title(lx: html.HtmlElement) -> str:
def get_year(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0][:4]
-def get_outline(lx: html.HtmlElement) -> str:
- return lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
+def get_outline(lx: html.HtmlElement, number: str, title: str) -> str:
+ o = lx.xpath("//div[@class='movie-info section']/p[@itemprop='description']/text()")[0].strip()
+ g = getStoryline(number, title)
+ if len(g):
+ return g
+ return o
def get_release(lx: html.HtmlElement) -> str:
return lx.xpath("//li[2]/span[@class='spec-content']/text()")[0].replace('/','-')
@@ -114,11 +123,10 @@ def get_actor_photo(browser):
if pos<0:
continue
css = html[pos:pos+100]
- p0 = css.find('background: url(')
- p1 = css.find('.jpg)')
- if p0<0 or p1<0:
+ cssBGjpgs = re.findall(r'background: url\((.+\.jpg)', css, re.I)
+ if not cssBGjpgs or not len(cssBGjpgs[0]):
continue
- p = {k: urljoin(browser.url, css[p0+len('background: url('):p1+len('.jpg')])}
+ p = {k: urljoin(browser.url, cssBGjpgs[0])}
o.update(p)
return o
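
Background for the get_html() switch above: Caribbeancom serves EUC-JP pages, so carib.py now
fetches raw bytes (return_type='content') and decodes them explicitly instead of relying on
the HTTP layer's charset guess. The same idea with plain requests (a sketch; the number is a
made-up example):

import requests

number = '010121-001'  # hypothetical movie number
r = requests.get(f'https://www.caribbeancom.com/moviepages/{number}/index.html')
htmlcode = r.content.decode('euc-jp', errors='replace')  # decode the raw bytes ourselves
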
diff --git a/WebCrawler/dlsite.py b/WebCrawler/dlsite.py
index 066e04f80..d22cdb170 100644
--- a/WebCrawler/dlsite.py
+++ b/WebCrawler/dlsite.py
@@ -153,7 +153,7 @@ def main(number):
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
except Exception as e:
- if config.Config().debug():
+ if config.getInstance().debug():
print(e)
data = {
"title": "",
diff --git a/WebCrawler/fc2.py b/WebCrawler/fc2.py
index e6ae516ad..0a51fdc0b 100644
--- a/WebCrawler/fc2.py
+++ b/WebCrawler/fc2.py
@@ -93,10 +93,11 @@ def main(number):
actor = '素人'
lx = etree.fromstring(htmlcode2, etree.HTMLParser())
cover = str(lx.xpath("//div[@class='items_article_MainitemThumb']/span/img/@src")).strip(" ['']")
+ cover = ADC_function.urljoin('https://adult.contents.fc2.com', cover)
dic = {
'title': lx.xpath('/html/head/title/text()')[0],
'studio': getStudio_fc2com(htmlcode2),
- 'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
+ 'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
'outline': '', # getOutline_fc2com(htmlcode2),
'runtime': str(lx.xpath("//p[@class='items_article_info']/text()")[0]),
'director': getStudio_fc2com(htmlcode2),
@@ -116,7 +117,7 @@ def main(number):
'series': '',
}
except Exception as e:
- if ADC_function.config.Config().debug():
+ if ADC_function.config.getInstance().debug():
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
@@ -124,4 +125,5 @@ def main(number):
if __name__ == '__main__':
print(main('FC2-1787685'))
+ print(main('FC2-2086710'))
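
The added ADC_function.urljoin call (urllib.parse.urljoin, judging by the wildcard import)
makes relative or protocol-relative cover paths absolute while leaving absolute URLs alone:

from urllib.parse import urljoin

urljoin('https://adult.contents.fc2.com', '/storage/img/a.jpg')
# -> 'https://adult.contents.fc2.com/storage/img/a.jpg'
urljoin('https://adult.contents.fc2.com', 'https://example.com/b.jpg')
# -> 'https://example.com/b.jpg'  (an absolute URL is returned unchanged)
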
diff --git a/WebCrawler/fc2club.py b/WebCrawler/fc2club.py
index 7d0fac6fb..df14b3b6a 100644
--- a/WebCrawler/fc2club.py
+++ b/WebCrawler/fc2club.py
@@ -84,7 +84,7 @@ def main(number):
dic = {
'title': getTitle_fc2com(htmlcode2),
'studio': getStudio_fc2com(htmlcode2),
- 'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
+ 'year': getYear_fc2com(getRelease_fc2com(htmlcode2)),
'outline': '', # getOutline_fc2com(htmlcode2),
'runtime': '',
'director': getStudio_fc2com(htmlcode2),
@@ -103,7 +103,7 @@ def main(number):
'series': '',
}
except Exception as e:
- if ADC_function.config.Config().debug():
+ if ADC_function.config.getInstance().debug():
print(e)
dic = {"title": ""}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py
index 7446ef3fe..786605274 100644
--- a/WebCrawler/javbus.py
+++ b/WebCrawler/javbus.py
@@ -1,114 +1,76 @@
import sys
sys.path.append('../')
import re
-from pyquery import PyQuery as pq#need install
from lxml import etree#need install
-from bs4 import BeautifulSoup#need install
import json
from ADC_function import *
-from WebCrawler import fanza
-from WebCrawler import airav
+from WebCrawler.storyline import getStoryline
-def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
- soup = BeautifulSoup(htmlcode, 'lxml')
- a = soup.find_all(attrs={'class': 'star-name'})
+def getActorPhoto(html):
+ actors = html.xpath('//div[@class="star-name"]/a')
d={}
- for i in a:
- l=i.a['href']
- t=i.get_text()
- html = etree.fromstring(get_html(l), etree.HTMLParser())
+ for i in actors:
+ url=i.attrib['href']
+ t=i.attrib['title']
+ html = etree.fromstring(get_html(url), etree.HTMLParser())
p=urljoin("https://www.javbus.com",
str(html.xpath('//*[@id="waterfall"]/div[1]/div/div[1]/img/@src')).strip(" ['']"))
p2={t:p}
d.update(p2)
return d
-def getTitle(htmlcode): #get the title
- doc = pq(htmlcode)
- title=str(doc('div.container h3').text()).replace(' ','-')
- try:
- title2 = re.sub('n\d+-','',title)
- return title2
- except:
- return title
-def getStudio(htmlcode): #get the studio (revised)
- html = etree.fromstring(htmlcode,etree.HTMLParser())
-    # if the record has no director, the studio is the 4th field
- if '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
- result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
-    # with a director present, the studio is the 5th field
- elif '製作商:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/span/text()')).strip(" ['']"):
- result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
- else:
- result = ''
- return result
-def getYear(htmlcode): #get the year
- html = etree.fromstring(htmlcode,etree.HTMLParser())
+def getTitle(html): #get the title
+    title = str(html.xpath('/html/head/title/text()')[0])
+    title = str(re.findall(r'^.+?\s+(.*) - JavBus$', title)[0]).strip()
+    return title
+def getStudioJa(html):
+    x = html.xpath('//span[contains(text(),"メーカー:")]/../a/text()')
+    return str(x[0]) if len(x) else ''
+def getStudio(html): #get the studio
+    x = html.xpath('//span[contains(text(),"製作商:")]/../a/text()')
+    return str(x[0]) if len(x) else ''
+def getYear(html): #get the year
+    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']").strip()
+    return result[:4] if len(result)>=len('2000-01-01') else ''
+def getCover(html): #get the cover URL
+    image = str(html.xpath('//a[@class="bigImage"]/@href')[0])
+    return urljoin("https://www.javbus.com", image)
+def getRelease(html): #get the release date
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
return result
-def getCover(htmlcode): #get the cover URL
-    doc = pq(htmlcode)
-    image = doc('a.bigImage')
-    return urljoin("https://www.javbus.com", image.attr('href'))
-def getRelease(htmlcode): #get the release date
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
-    return result
-def getRuntime(htmlcode): #get the runtime in minutes (revised)
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
+def getRuntime(html): #get the runtime in minutes (revised)
result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[3]/text()')).strip(" ['']分鐘")
return result
-def getActor(htmlcode): #get the actresses
+def getActor(html): #get the actresses
b=[]
- soup=BeautifulSoup(htmlcode,'lxml')
- a=soup.find_all(attrs={'class':'star-name'})
- for i in a:
- b.append(i.get_text())
+ actors = html.xpath('//div[@class="star-name"]/a')
+ for i in actors:
+ b.append(i.attrib['title'])
return b
-def getNum(htmlcode): #get the number (ID)
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
-    return result
-def getDirector(htmlcode): #get the director (revised)
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    if '導演:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
-    else:
-        result = '' # the record may contain no director at all
- return result
-def getCID(htmlcode):
- html = etree.fromstring(htmlcode, etree.HTMLParser())
- #print(htmlcode)
+def getNum(html): #get the number (ID)
+    kwdlist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
+    return kwdlist[0]
+def getDirectorJa(html):
+    x = html.xpath('//span[contains(text(),"監督:")]/../a/text()')
+    return str(x[0]) if len(x) else ''
+def getDirector(html): #get the director
+ x = html.xpath('//span[contains(text(),"導演:")]/../a/text()')
+ return str(x[0]) if len(x) else ''
+def getCID(html):
string = html.xpath("//a[contains(@class,'sample-box')][1]/@href")[0].replace('https://pics.dmm.co.jp/digital/video/','')
result = re.sub('/.*?.jpg','',string)
return result
-def getOutline(number): #get the plot synopsis
- try:
- response = json.loads(airav.main(number))
- result = response['outline']
- return result
- except:
- return ''
-def getSerise(htmlcode): #get the series (revised)
-    html = etree.fromstring(htmlcode, etree.HTMLParser())
-    # if the record has no director, the series is the 6th field
-    if '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[6]/a/text()')).strip(" ['']")
-    # with a director present, the series is the 7th field
-    elif '系列:' == str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/span/text()')).strip(" ['']"):
-        result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[7]/a/text()')).strip(" ['']")
-    else:
-        result = ''
-    return result
-def getTag(htmlcode): # get the tags
- tag = []
- soup = BeautifulSoup(htmlcode, 'lxml')
- a = soup.find_all(attrs={'class': 'genre'})
- for i in a:
- if 'onmouseout' in str(i) or '多選提交' in str(i):
- continue
- tag.append(translateTag_to_sc(i.get_text()))
- return tag
-
+def getOutline(number, title): #get the plot synopsis; storyline sites are queried concurrently
+    return getStoryline(number,title)
+def getSeriseJa(html):
+    x = html.xpath('//span[contains(text(),"シリーズ:")]/../a/text()')
+    return str(x[0]) if len(x) else ''
+def getSerise(html): #get the series
+    x = html.xpath('//span[contains(text(),"系列:")]/../a/text()')
+    return str(x[0]) if len(x) else ''
+def getTag(html): # get the tags
+ klist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',')
+ taglist = [translateTag_to_sc(v) for v in klist[1:]]
+ return taglist
 def getExtrafanart(htmlcode): # get the stills (extra fanart)
     html_pather = re.compile(r'<...>[\s\S]*?<...>\s*?<...>')  # the HTML tags in this pattern were lost in extraction
     html = html_pather.search(htmlcode)
@@ -117,32 +79,34 @@ def getExtrafanart(htmlcode): # get the stills (extra fanart)
     extrafanart_pather = re.compile(r'<a class="sample-box" href="(.*?)"')  # reconstructed; the class name matches the getCID() selector above
[... the remainder of getExtrafanart() and the head of main_uncensored() were lost in extraction ...]
 def main_uncensored(number):
     htmlcode = get_html('https://www.javbus.com/ja/' + number)
+    if "404 Page Not Found" in htmlcode:
+        raise Exception('404 page not found')
+ lx = etree.fromstring(htmlcode, etree.HTMLParser())
+ title = getTitle(lx)
dic = {
- 'title': str(re.sub('\w+-\d+-','',getTitle(htmlcode))).replace(getNum(htmlcode)+'-',''),
- 'studio': getStudio(htmlcode),
- 'year': getYear(htmlcode),
- 'outline': getOutline(number),
- 'runtime': getRuntime(htmlcode),
- 'director': getDirector(htmlcode),
- 'actor': getActor(htmlcode),
- 'release': getRelease(htmlcode),
- 'number': getNum(htmlcode),
- 'cover': getCover(htmlcode),
- 'tag': getTag(htmlcode),
+ 'title': title,
+ 'studio': getStudioJa(lx),
+ 'year': getYear(lx),
+ 'outline': getOutline(number, title),
+ 'runtime': getRuntime(lx),
+ 'director': getDirectorJa(lx),
+ 'actor': getActor(lx),
+ 'release': getRelease(lx),
+ 'number': getNum(lx),
+ 'cover': getCover(lx),
+ 'tag': getTag(lx),
'extrafanart': getExtrafanart(htmlcode),
- 'label': getSerise(htmlcode),
+ 'label': getSeriseJa(lx),
'imagecut': 0,
- 'actor_photo': '',
+# 'actor_photo': '',
'website': 'https://www.javbus.com/ja/' + number,
'source': 'javbus.py',
- 'series': getSerise(htmlcode),
+ 'series': getSeriseJa(lx),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js
@@ -155,32 +119,36 @@ def main(number):
htmlcode = get_html('https://www.fanbus.us/' + number)
except:
htmlcode = get_html('https://www.javbus.com/' + number)
+ if "404 Page Not Found" in htmlcode:
+ raise Exception('404 page not found')
+ lx = etree.fromstring(htmlcode,etree.HTMLParser())
+ title = getTitle(lx)
dic = {
- 'title': str(re.sub('\w+-\d+-', '', getTitle(htmlcode))),
- 'studio': getStudio(htmlcode),
- 'year': str(re.search('\d{4}', getYear(htmlcode)).group()),
- 'outline': getOutline(number),
- 'runtime': getRuntime(htmlcode),
- 'director': getDirector(htmlcode),
- 'actor': getActor(htmlcode),
- 'release': getRelease(htmlcode),
- 'number': getNum(htmlcode),
- 'cover': getCover(htmlcode),
+ 'title': title,
+ 'studio': getStudio(lx),
+ 'year': getYear(lx),
+ 'outline': getOutline(number, title),
+ 'runtime': getRuntime(lx),
+ 'director': getDirector(lx),
+ 'actor': getActor(lx),
+ 'release': getRelease(lx),
+ 'number': getNum(lx),
+ 'cover': getCover(lx),
'imagecut': 1,
- 'tag': getTag(htmlcode),
+ 'tag': getTag(lx),
'extrafanart': getExtrafanart(htmlcode),
- 'label': getSerise(htmlcode),
- 'actor_photo': getActorPhoto(htmlcode),
+ 'label': getSerise(lx),
+# 'actor_photo': getActorPhoto(lx),
'website': 'https://www.javbus.com/' + number,
'source': 'javbus.py',
- 'series': getSerise(htmlcode),
+ 'series': getSerise(lx),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4,separators=(',', ':'), ) # .encode('UTF-8')
return js
except:
return main_uncensored(number)
except Exception as e:
- if config.Config().debug():
+ if config.getInstance().debug():
print(e)
data = {
"title": "",
@@ -191,5 +159,13 @@ def main(number):
return js
if __name__ == "__main__" :
+ config.G_conf_override['debug_mode:switch'] = True
+ print(main('ABP-888'))
+ print(main('ABP-960'))
+ print(main('ADV-R0624')) # 404
+ print(main('MMNT-010'))
print(main('ipx-292'))
print(main('CEMD-011'))
+ print(main('CJOD-278'))
+ print(main('100221_001'))
+ print(main('AVSW-061'))
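
The pattern running through the rewritten javbus helpers: instead of brittle positional paths like /html/body/div[5]/.../p[4], each field is located by its visible label and read from the sibling link, so a missing 導演 row no longer shifts every later field. A hedged, self-contained illustration (the HTML snippet is invented; real javbus markup differs in detail):

from lxml import etree

doc = etree.fromstring(
    '<div><p><span>製作商:</span> <a>Some Studio</a></p>'
    '<p><span>系列:</span> <a>Some Series</a></p></div>',
    etree.HTMLParser())
x = doc.xpath('//span[contains(text(),"製作商:")]/../a/text()')
print(str(x[0]) if len(x) else '')  # -> Some Studio, with or without a director row present
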
diff --git a/WebCrawler/javdb.py b/WebCrawler/javdb.py
index ecc4f3637..e4e803c2d 100755
--- a/WebCrawler/javdb.py
+++ b/WebCrawler/javdb.py
@@ -3,25 +3,22 @@
import re
from lxml import etree
import json
-from bs4 import BeautifulSoup
from ADC_function import *
-from WebCrawler import airav
-# import sys
+from mechanicalsoup.stateful_browser import StatefulBrowser
+from WebCrawler.storyline import getStoryline
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
-def getTitle(a):
- html = etree.fromstring(a, etree.HTMLParser())
+def getTitle(html):
browser_title = str(html.xpath("/html/head/title/text()")[0])
return browser_title[:browser_title.find(' | JavDB')].strip()
-def getActor(a):
- html = etree.fromstring(a, etree.HTMLParser())
+def getActor(html):
actors = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/text()')
genders = html.xpath('//span[@class="value"]/a[contains(@href,"/actors/")]/../strong/@class')
r = []
idx = 0
- actor_gendor = config.Config().actor_gender()
+ actor_gendor = config.getInstance().actor_gender()
if not actor_gendor in ['female','male','both','all']:
actor_gendor = 'female'
for act in actors:
@@ -33,8 +30,8 @@ def getActor(a):
idx = idx + 1
return r
-def getaphoto(url):
- html_page = get_html(url)
+def getaphoto(url, browser):
+ html_page = browser.open_relative(url).text if isinstance(browser, StatefulBrowser) else get_html(url)
+    img_prether = re.compile(r'...')  # the avatar-image pattern was lost in extraction, together with the next several lines
[... the tail of getaphoto() and the head of the removed getActorPhoto() are missing here ...]
-    actorall_prether = re.compile(r'...演員\:...\s*?.*?(.*)\s*?...')  # the HTML tags around the 演員: label were lost in extraction
- actorall = actorall_prether.findall(html)
-
- if actorall:
- actoralls = actorall[0]
-        actor_prether = re.compile(r'<a href="(.*?)">(.*?)</a>')  # tags were lost in extraction; reconstructed from the i[0]/i[1] usage below
- actor = actor_prether.findall(actoralls)
- actor_photo = {}
- for i in actor:
- actor_photo[i[1]] = getaphoto('https://' + javdb_site + '.com'+i[0])
-
- return actor_photo
-
- else:
+def getActorPhoto(html, javdb_site, browser): #//*[@id="star_qdt"]/li/a/img
+ actorall = html.xpath('//strong[contains(text(),"演員:")]/../span/a[starts-with(@href,"/actors/")]')
+ if not actorall:
return {}
+ a = getActor(html)
+ actor_photo = {}
+ for i in actorall:
+ if i.text in a:
+ actor_photo[i.text] = getaphoto(urljoin(f'https://{javdb_site}.com', i.attrib['href']), browser)
+ return actor_photo
-def getStudio(a):
+def getStudio(a, html):
# html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
# result1 = str(html.xpath('//strong[contains(text(),"片商")]/../span/text()')).strip(" ['']")
# result2 = str(html.xpath('//strong[contains(text(),"片商")]/../span/a/text()')).strip(" ['']")
@@ -67,23 +58,25 @@ def getStudio(a):
         patherr = re.compile(r'...片商\:...[\s\S]*?...(.*?)...')  # the HTML tags in this pattern were lost in extraction
pianshang = patherr.findall(a)
if pianshang:
- result = pianshang[0]
- else:
- result = ""
+ result = pianshang[0].strip()
+ if len(result):
+ return result
+    # fall back to the seller as the studio
+ try:
+ result = str(html.xpath('//strong[contains(text(),"賣家:")]/../span/a/text()')).strip(" ['']")
+ except:
+ result = ''
return result
-def getRuntime(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getRuntime(html):
result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi')
-def getLabel(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getLabel(html):
result1 = str(html.xpath('//strong[contains(text(),"系列")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"系列")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').replace("', '", '').replace('"', '')
-def getNum(a):
- html = etree.fromstring(a, etree.HTMLParser())
+def getNum(html):
result1 = str(html.xpath('//strong[contains(text(),"番號")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"番號")]/../span/a/text()')).strip(" ['']")
return str(result2 + result1).strip('+')
@@ -113,8 +106,7 @@ def getRelease(a):
else:
result = ''
return result
-def getTag(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getTag(html):
try:
result = html.xpath('//strong[contains(text(),"類別")]/../span/a/text()')
total = []
@@ -135,11 +127,10 @@ def getTag(a):
pass
return total
-def getCover_small(a, index=0):
+def getCover_small(html, index=0):
# same issue mentioned below,
     # javdb sometimes returns multiple results
     # DO NOT just get the first one, get the one with the correct index number
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
if not 'https' in result:
@@ -170,66 +161,76 @@ def getTrailer(htmlcode): # get the trailer
video_url = ''
return video_url
-def getExtrafanart(htmlcode): # get the stills (extra fanart)
-    html_pather = re.compile(r'<...>[\s\S]*?<...>\s+?<...>\s+?')  # the HTML tags in this pattern were lost in extraction
-    html = html_pather.search(htmlcode)
-    if html:
-        html = html.group()
-        extrafanart_pather = re.compile(r'<...')  # the rest of this pattern was lost in extraction
[... the remaining javdb.py hunks plus the new-file header and imports of WebCrawler/storyline.py are missing here; the text resumes inside storyline.py's airav lookup ...]
+    avs = browser.page.select_one('... ul > li:nth-child(1) > div')  # the head of this CSS selector was lost in extraction
+ if number_up not in avs.select_one('a > h3').text.upper():
+ raise ValueError("number not found")
+ detail_url = avs.select_one('a')['href']
+ res = browser.open_relative(detail_url)
+ if not res.ok:
+ raise ValueError(f"browser.open_relative('{detail_url}') failed")
+ t = browser.page.select_one('head > title').text
+ airav_number = str(re.findall(r'^\s*\[(.*?)]', t)[0]).upper()
+ if number.upper() != airav_number:
+ raise ValueError(f"page number ->[{airav_number}] not match")
+ desc = browser.page.select_one('li.introduction > span').text.strip()
+ return desc
+ except Exception as e:
+ if debug:
+ print(f"[-]MP getOutline_amazon Error: {e},number [{number}].")
+ pass
+ return None
+
+
+def getStoryline_58avgo(number, debug):
+ try:
+        url = 'http://58avgo.com/cn/index.aspx' + secrets.choice([
+            '', '?status=3', '?status=4', '?status=7', '?status=9', '?status=10', '?status=11', '?status=12',
+            '?status=1&Sort=Playon', '?status=1&Sort=dateupload', '?status=1&Sort=dateproduce'
+        ]) # pick a random entry URL so one IP's requests do not look too uniform in the site's httpd logs
+ kwd = number[:6] if re.match(r'\d{6}[\-_]\d{2,3}', number) else number
+ result, browser = get_html_by_form(url,
+ fields = {'ctl00$TextBox_SearchKeyWord' : kwd},
+ return_type = 'browser')
+ if not result.ok:
+ raise ValueError(f"get_html_by_form('{url}','{number}') failed")
+ if f'searchresults.aspx?Search={kwd}' not in browser.url:
+ raise ValueError("number not found")
+ s = browser.page.select('div.resultcontent > ul > li.listItem > div.one-info-panel.one > a.ga_click')
+ link = None
+ for i in range(len(s)):
+ title = s[i].h3.text.strip()
+ if re.search(number, title, re.I):
+ link = s[i]
+            break
+ if link is None:
+ raise ValueError("number not found")
+ result = browser.follow_link(link)
+ if not result.ok or 'playon.aspx' not in browser.url:
+ raise ValueError("detail page not found")
+ title = browser.page.select('head > title')[0].text.strip()
+    detail_number = str(re.findall(r'\[(.*?)]', title)[0])
+    if not re.search(number, detail_number, re.I):
+        raise ValueError(f"detail page number not match, got ->[{detail_number}]")
+ return browser.page.select('#ContentPlaceHolder1_Label2')[0].text.strip()
+ except Exception as e:
+ if debug:
+ print(f"[-]MP getOutline_58avgo Error: {e}, number [{number}].")
+ pass
+ return ''
+
+
+def getStoryline_avno1(number, debug): #get the plot synopsis from avno1.cc
+ try:
+ url = 'http://www.avno1.cc/cn/' + secrets.choice(['usercenter.php?item=' +
+ secrets.choice(['pay_support', 'qa', 'contact', 'guide-vpn']),
+ '?top=1&cat=hd', '?top=1', '?cat=hd', 'porn', '?cat=jp', '?cat=us', 'recommend_category.php'
+    ]) # pick a random entry URL so one IP's requests do not look too uniform in the site's httpd logs
+ result, browser = get_html_by_form(url,
+ form_select='div.wrapper > div.header > div.search > form',
+ fields = {'kw' : number},
+ return_type = 'browser')
+ if not result.ok:
+ raise ValueError(f"get_html_by_form('{url}','{number}') failed")
+ s = browser.page.select('div.type_movie > div > ul > li > div')
+ for i in range(len(s)):
+ title = s[i].a.h3.text.strip()
+ page_number = title[title.rfind(' '):].strip()
+ if re.search(number, page_number, re.I):
+ return s[i]['data-description'].strip()
+ raise ValueError(f"page number ->[{page_number}] not match")
+ except Exception as e:
+ if debug:
+ print(f"[-]MP getOutline_avno1 Error: {e}, number [{number}].")
+ pass
+ return ''
+
+
+def getStoryline_xcity(number, debug): #get the plot synopsis from xcity
+ try:
+ xcity_number = number.replace('-','')
+ query_result, browser = get_html_by_form(
+ 'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
+ fields = {'q' : xcity_number.lower()},
+ return_type = 'browser')
+ if not query_result or not query_result.ok:
+ raise ValueError("page not found")
+ result = browser.follow_link(browser.links('avod\/detail')[0])
+ if not result.ok:
+ raise ValueError("detail page not found")
+ return browser.page.select_one('h2.title-detail + p.lead').text.strip()
+ except Exception as e:
+ if debug:
+ print(f"[-]MP getOutline_xcity Error: {e}, number [{number}].")
+ pass
+ return ''
+
+
+def getStoryline_amazon(q_title, number, debug):
+ if not isinstance(q_title, str) or not len(q_title):
+ return None
+ try:
+ amazon_cookie, _ = load_cookies('amazon.json')
+ cookie = amazon_cookie if isinstance(amazon_cookie, dict) else None
+ url = "https://www.amazon.co.jp/s?k=" + q_title
+ res, browser = get_html_by_browser(url, cookies=cookie, return_type='browser')
+ if not res.ok:
+ raise ValueError("get_html_by_browser() failed")
+ lks = browser.links(r'/black-curtain/save-eligibility/black-curtain')
+ if isinstance(lks, list) and len(lks):
+ browser.follow_link(lks[0])
+ cookie = None
+ html = etree.fromstring(str(browser.page), etree.HTMLParser())
+ titles = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/text()")
+ urls = html.xpath("//span[contains(@class,'a-color-base a-text-normal')]/../@href")
+ if not len(urls) or len(urls) != len(titles):
+ raise ValueError("titles not found")
+ idx = amazon_select_one(titles, q_title, number, debug)
+ if not isinstance(idx, int) or idx < 0:
+ raise ValueError("title and number not found")
+ furl = urls[idx]
+ r = browser.open_relative(furl)
+ if not r.ok:
+ raise ValueError("browser.open_relative()) failed.")
+ lks = browser.links(r'/black-curtain/save-eligibility/black-curtain')
+ if isinstance(lks, list) and len(lks):
+ browser.follow_link(lks[0])
+ cookie = None
+
+ ama_t = browser.page.select_one('#productDescription > p').text.replace('\n',' ').strip()
+ ama_t = re.sub(r'審査番号:\d+', '', ama_t)
+
+ if cookie is None:
+            # The auto-created cookies file sits at the end of the search-path list with the lowest priority; users with an amazon.co.jp account can export browser cookies into an earlier search path
+ ama_save = Path.home() / ".local/share/avdc/amazon.json"
+ ama_save.parent.mkdir(parents=True, exist_ok=True)
+ ama_save.write_text(json.dumps(browser.session.cookies.get_dict(), sort_keys=True, indent=4), encoding='utf-8')
+
+ return ama_t
+
+ except Exception as e:
+ if debug:
+ print(f'[-]MP getOutline_amazon Error: {e}, number [{number}], title: {q_title}')
+ pass
+ return None
+
+# among the DVD and Blu-ray listings, pick the one whose title is most similar
+def amazon_select_one(a_titles, q_title, number, debug):
+ sel = -1
+ ratio = 0
+ que_t = ''.join(c for c in q_title if not re.match(r'(P|S|Z).*', category(c), re.A))
+ for loc in range(len(a_titles)):
+ t = a_titles[loc]
+        if re.search(number, t, re.I): # listings rarely carry the ID number, but a few do; an exact match passes immediately
+ return loc
+ if not re.search('DVD|Blu-ray', t, re.I):
+ continue
+        ama_t = str(re.sub('DVD|Blu-ray', "", t, flags=re.I))
+ ama_t = ''.join(c for c in ama_t if not re.match(r'(P|S|Z).*', category(c), re.A))
+ findlen = 0
+ lastpos = -1
+ cnt = len(ama_t)
+ for c in reversed(ama_t):
+ cnt -= 1
+ pos = que_t.rfind(c)
+ if lastpos >= 0:
+ pos_near = que_t[:lastpos].rfind(c)
+ if pos_near < 0:
+ findlen = 0
+ lastpos = -1
+ ama_t = ama_t[:cnt+1]
+ else:
+ pos = pos_near
+ if pos < 0:
+ if category(c) == 'Nd':
+ return -1
+ ama_t = ama_t[:cnt]
+ findlen = 0
+ lastpos = -1
+ continue
+ if findlen > 0 and len(que_t) > 1 and lastpos == pos+1:
+ findlen += 1
+ lastpos = pos
+ if findlen >= 4:
+ break
+ continue
+ findlen = 1
+ lastpos = pos
+ if findlen==0:
+ return -1
+ r = SequenceMatcher(None, ama_t, que_t).ratio()
+ if r > ratio:
+ sel = loc
+ ratio = r
+ save_t_ = ama_t
+ if ratio > 0.999:
+ break
+
+ if ratio < 0.5:
+ return -1
+
+ if not debug:
+        # currently only results with a similarity of at least 0.9 are accepted
+ return sel if ratio >= 0.9 else -1
+
+    # in debug mode, log data for auditing match accuracy
+    if ratio < 0.9:
+        # rejected results with similarity in [0.5, 0.9) are logged separately
+ (Path.home() / '.avlogs/ratio0.5.txt').open('a', encoding='utf-8').write(
+ f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
+ return -1
+    # log of the accepted results
+ (Path.home() / '.avlogs/ratio.txt').open('a', encoding='utf-8').write(
+ f' [{number}] Ratio:{ratio}\n{a_titles[sel]}\n{q_title}\n{save_t_}\n{que_t}\n')
+ return sel
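
The matcher above boils down to two standard-library tools; here is a hedged, runnable distillation (category() is assumed to be unicodedata.category, imported in the part of storyline.py lost above; the titles are invented):

from difflib import SequenceMatcher
from unicodedata import category

def clean(t: str) -> str:
    # drop Punctuation (P*), Symbol (S*) and Separator (Z*) characters, mirroring the re.match filter above
    return ''.join(c for c in t if not category(c).startswith(('P', 'S', 'Z')))

a = clean('サンプル・タイトル!! 012')
b = clean('サンプル タイトル 012')
print(SequenceMatcher(None, a, b).ratio())  # -> 1.0 once punctuation and whitespace are ignored
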
diff --git a/WebCrawler/xcity.py b/WebCrawler/xcity.py
index a7b4cffc2..ed381e75c 100644
--- a/WebCrawler/xcity.py
+++ b/WebCrawler/xcity.py
@@ -3,16 +3,12 @@
import re
from lxml import etree
import json
-from bs4 import BeautifulSoup
from ADC_function import *
-
-
-# import sys
+from WebCrawler.storyline import getStoryline
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
-def getTitle(a):
- html = etree.fromstring(a, etree.HTMLParser())
+def getTitle(html):
result = html.xpath('//*[@id="program_detail_title"]/text()')[0]
return result
@@ -43,8 +39,7 @@ def getActorPhoto(browser):
return o
-def getStudio(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getStudio(html):
try:
result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[4]/a/span/text()')).strip(" ['']")
except:
@@ -52,20 +47,14 @@ def getStudio(a):
return result.strip('+').replace("', '", '').replace('"', '')
-def getRuntime(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getRuntime(html):
try:
- result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[3]/text()')[0]
+ x = html.xpath('//span[@class="koumoku" and text()="収録時間"]/../text()')[1].strip()
+ return x
except:
return ''
- try:
- return re.findall('\d+',result1)[0]
- except:
- return ''
-
-def getLabel(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getLabel(html):
try:
result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[5]/a/span/text()')[0]
return result
@@ -73,8 +62,7 @@ def getLabel(a):
return ''
-def getNum(a):
- html = etree.fromstring(a, etree.HTMLParser())
+def getNum(html):
try:
result = html.xpath('//*[@id="hinban"]/text()')[0]
return result
@@ -90,8 +78,7 @@ def getYear(getRelease):
return getRelease
-def getRelease(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getRelease(html):
try:
result = str(html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[2]/text()')[1])
except:
@@ -102,31 +89,22 @@ def getRelease(a):
return ''
-def getTag(a):
- result2=[]
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
- result1 = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[1]/li[6]/a/text()')
- for i in result1:
- i=i.replace(u'\n','')
- i=i.replace(u'\t','')
- if len(i):
- result2.append(i)
- return result2
+def getTag(html):
+ x = html.xpath('//span[@class="koumoku" and text()="ジャンル"]/../a[starts-with(@href,"/avod/genre/")]/text()')
+ return [translateTag_to_sc(i.strip()) for i in x if len(i.strip())] if len(x) and len(x[0]) else []
-def getCover_small(a, index=0):
+def getCover_small(html, index=0):
# same issue mentioned below,
     # javdb sometimes returns multiple results
     # DO NOT just get the first one, get the one with the correct index number
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
if not 'https' in result:
result = 'https:' + result
return result
-def getCover(htmlcode):
- html = etree.fromstring(htmlcode, etree.HTMLParser())
+def getCover(html):
try:
result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[1]/p/a/@href')[0]
return 'https:' + result
@@ -134,8 +112,7 @@ def getCover(htmlcode):
return ''
-def getDirector(a):
- html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
+def getDirector(html):
try:
result = html.xpath('//*[@id="program_detail_director"]/text()')[0].replace(u'\n','').replace(u'\t', '')
return result
@@ -143,19 +120,21 @@ def getDirector(a):
return ''
-def getOutline(htmlcode):
- html = etree.fromstring(htmlcode, etree.HTMLParser())
+def getOutline(html, number, title):
+ storyline_site = config.getInstance().storyline_site().split(',')
+    a = set(storyline_site) & {'airav', 'avno1'} # only the sites that return Chinese synopsis text
+ if len(a):
+ site = [n for n in storyline_site if n in a]
+ g = getStoryline(number, title, site)
+ if len(g):
+ return g
try:
- result = html.xpath('//*[@id="avodDetails"]/div/div[3]/div[2]/div/ul[2]/li[5]/p/text()')[0]
+ x = html.xpath('//h2[@class="title-detail"]/../p[@class="lead"]/text()')[0]
+ return x.replace(getNum(html), '')
except:
return ''
- try:
- return re.sub('\\\\\w*\d+','',result)
- except:
- return result
-def getSeries(htmlcode):
- html = etree.fromstring(htmlcode, etree.HTMLParser())
+def getSeries(html):
try:
try:
result = html.xpath("//span[contains(text(),'シリーズ')]/../a/span/text()")[0]
@@ -181,11 +160,10 @@ def getExtrafanart(htmlcode): # get the stills (extra fanart)
return s
return ''
-def main(number):
- try:
+def open_by_browser(number):
xcity_number = number.replace('-','')
query_result, browser = get_html_by_form(
- 'https://xcity.jp/about/',
+ 'https://xcity.jp/' + secrets.choice(['about/','sitemap/','policy/','law/','help/','main/']),
fields = {'q' : xcity_number.lower()},
return_type = 'browser')
if not query_result or not query_result.ok:
@@ -193,38 +171,44 @@ def main(number):
result = browser.follow_link(browser.links('avod\/detail')[0])
if not result.ok:
raise ValueError("xcity.py: detail page not found")
- detail_page = str(browser.page)
+ return str(browser.page), browser
+
+def main(number):
+ try:
+ detail_page, browser = open_by_browser(number)
url = browser.url
- newnum = getNum(detail_page).upper()
+ lx = etree.fromstring(detail_page, etree.HTMLParser())
+ newnum = getNum(lx).upper()
number_up = number.upper()
if newnum != number_up:
- if newnum == xcity_number.upper():
+ if newnum == number.replace('-','').upper():
newnum = number_up
else:
raise ValueError("xcity.py: number not found")
+ title = getTitle(lx)
dic = {
'actor': getActor(browser),
- 'title': getTitle(detail_page),
- 'studio': getStudio(detail_page),
- 'outline': getOutline(detail_page),
- 'runtime': getRuntime(detail_page),
- 'director': getDirector(detail_page),
- 'release': getRelease(detail_page),
+ 'title': title,
+ 'studio': getStudio(lx),
+ 'outline': getOutline(lx, number, title),
+ 'runtime': getRuntime(lx),
+ 'director': getDirector(lx),
+ 'release': getRelease(lx),
'number': newnum,
- 'cover': getCover(detail_page),
+ 'cover': getCover(lx),
'cover_small': '',
'extrafanart': getExtrafanart(detail_page),
'imagecut': 1,
- 'tag': getTag(detail_page),
- 'label': getLabel(detail_page),
- 'year': getYear(getRelease(detail_page)), # str(re.search('\d{4}',getRelease(a)).group()),
+ 'tag': getTag(lx),
+ 'label': getLabel(lx),
+ 'year': getYear(getRelease(lx)), # str(re.search('\d{4}',getRelease(a)).group()),
# 'actor_photo': getActorPhoto(browser),
'website': url,
'source': 'xcity.py',
- 'series': getSeries(detail_page),
+ 'series': getSeries(lx),
}
except Exception as e:
- if config.Config().debug():
+ if config.getInstance().debug():
print(e)
dic = {"title": ""}
diff --git a/config.ini b/config.ini
index 58e6892cb..eef14db54 100755
--- a/config.ini
+++ b/config.ini
@@ -1,12 +1,13 @@
[common]
main_mode=1
+source_folder=./
failed_output_folder=failed
success_output_folder=JAV_output
soft_link=0
failed_move=1
auto_exit=0
transalte_to_sc=0
-multi_threading=1
+multi_threading=0
;actor_gender value: female(♀) or male(♂) or both(♀ ♂) or all(♂ ♀ ⚧)
actor_gender=female
del_empty_folder=1
@@ -16,6 +17,8 @@ nfo_skip_days=30
 ; stop after this many video files have been processed; 0 means process all of them
 stop_counter=0
 ; used together, the two settings above let you scrape or organize thousands of files in several small batches without triggering translation or metadata-site bans
+ignore_failed_list=0
+download_only_missing_images=1
[proxy]
;proxytype: http or socks5 or socks5h switch: 0 1
@@ -62,8 +65,7 @@ switch=0
 ; used to decide whether a film is uncensored
[uncensored]
-uncensored_prefix=S2M,BT,LAF,SMD
-
+uncensored_prefix=S2M,BT,LAF,SMD,SMBD,SM3D2DBD,SKY-,SKYHD,CWP,CWDV,CWBD,CW3D2DBD,MKD,MKBD,MXBD,MK3D2DBD,MCB3DBD,MCBD,RHJ,RED
[media]
 ; video file extensions
@@ -82,3 +84,20 @@ water=2
switch=0
extrafanart_folder=extrafanart
+; storyline (plot synopsis)
+[storyline]
+; When website is javbus, javdb, avsox, xcity or carib, the `site`, `censored_site` and `uncensored_site`
+; lists name the candidate data sources for the synopsis. All sites in a list are queried concurrently;
+; priority runs left to right, and a site's result is only used when every site to its left returned nothing.
+; airav, avno1 and 58avgo return Chinese synopses: airav covers censored films only, avno1 covers both,
+; and 58avgo only uncensored or leaked/decensored films (that capability is unused).
+; xcity and amazon return Japanese text; Amazon listings carry no ID number, so picking the matching DVD
+; is only about 99.6% accurate. If all three lists are empty, no lookup is made at all,
+; which speeds up scraping considerably.
+; site=
+site=avno1
+censored_site=airav,xcity,amazon
+uncensored_site=58avgo
+; run mode: 0 sequential (slowest), 1 thread pool (default), 2 process pool (higher startup cost than the thread pool, but faster the more sites are queried concurrently)
+run_mode=1
+; show_result: synopsis debug output. 0 off, 1 brief, 2 verbose (the verbose part is not written to the log); turn on 2 to diagnose failing synopsis lookups
+show_result=0
+
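
Consumption of the new [storyline] keys mirrors the try/except accessors added to config.py below; a minimal stand-alone sketch of that pattern:

import configparser

conf = configparser.ConfigParser()
conf.read_string("[storyline]\nsite=avno1\ncensored_site=airav,xcity,amazon\nrun_mode=1\n")
try:
    sites = conf.get("storyline", "censored_site").split(',')
except (configparser.NoSectionError, configparser.NoOptionError):
    sites = "airav,xcity,amazon".split(',')  # same default the accessor falls back to
print(sites)  # -> ['airav', 'xcity', 'amazon']
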
diff --git a/config.py b/config.py
index 82fd34573..f6d6488ef 100644
--- a/config.py
+++ b/config.py
@@ -1,33 +1,82 @@
import os
+import re
import sys
import configparser
-import codecs
from pathlib import Path
+
+G_conf_override = {
+    # index 0 saves the first Config() instance for quick access via getInstance()
+    0 : None,
+    # registered overridable config items
+ "common:main_mode" : None,
+ "common:source_folder" : None,
+ "common:auto_exit" : None,
+ "common:nfo_skip_days" : None,
+ "common:stop_counter" : None,
+ "common:ignore_failed_list" : None,
+ "debug_mode:switch" : None
+}
+
+
+def getInstance():
+ if isinstance(G_conf_override[0], Config):
+ return G_conf_override[0]
+ return Config()
+
+
class Config:
def __init__(self, path: str = "config.ini"):
- path_search_order = [
- path,
- "./config.ini",
- os.path.join(Path.home(), "avdc.ini"),
- os.path.join(Path.home(), ".avdc.ini"),
- os.path.join(Path.home(), ".avdc/config.ini"),
- os.path.join(Path.home(), ".config/avdc/config.ini")
- ]
+ path_search_order = (
+ Path(path),
+ Path.cwd() / "config.ini",
+ Path.home() / "avdc.ini",
+ Path.home() / ".avdc.ini",
+ Path.home() / ".avdc/config.ini",
+ Path.home() / ".config/avdc/config.ini"
+ )
ini_path = None
for p in path_search_order:
- if os.path.isfile(p):
- ini_path = p
+ if p.is_file():
+ ini_path = p.resolve()
break
if ini_path:
self.conf = configparser.ConfigParser()
+ self.ini_path = ini_path
try:
- self.conf.read(ini_path, encoding="utf-8-sig")
+ if self.conf.read(ini_path, encoding="utf-8-sig"):
+ if G_conf_override[0] is None:
+ G_conf_override[0] = self
except:
- self.conf.read(ini_path, encoding="utf-8")
+ if self.conf.read(ini_path, encoding="utf-8"):
+ if G_conf_override[0] is None:
+ G_conf_override[0] = self
else:
- print("[-]Config file not found!")
- sys.exit(2)
+ print("ERROR: Config file not found!")
+ print("Please put config file into one of the following path:")
+ print('\n'.join([str(p.resolve()) for p in path_search_order[2:]]))
+            # If no config file is found, fall back to the default config bundled at packaging time and,
+            # when needed, generate it in the search path; that is more reliable than having the user hunt
+            # down a config for the wrong version, and it makes the single executable self-contained and safe to run from any path.
+ res_path = None
+            # a pyinstaller bundle looks for config.ini inside the bundle
+            if hasattr(sys, '_MEIPASS') and (Path(getattr(sys, '_MEIPASS')) / 'config.ini').is_file():
+                res_path = Path(getattr(sys, '_MEIPASS')) / 'config.ini'
+            # a plain script looks next to the script file
+ elif (Path(__file__).resolve().parent / 'config.ini').is_file():
+ res_path = Path(__file__).resolve().parent / 'config.ini'
+ if res_path is None:
+ sys.exit(2)
+            ins = input("Or, do you want me to create a config file for you? (Yes/No)[Y]:")
+ if re.search('n', ins, re.I):
+ sys.exit(2)
+            # Only the user's home directory is reliably writable, so ~/avdc.ini is used as the generated-config path rather than
+            # the current directory, which may lack write permission. Keeping a config in the current directory is no longer encouraged and survives only as a trick for switching between multiple configs.
+ write_path = path_search_order[2] # Path.home() / "avdc.ini"
+ write_path.write_text(res_path.read_text(encoding='utf-8'), encoding='utf-8')
+ print("Config file '{}' created.".format(write_path.resolve()))
+ input("Press Enter key exit...")
+ sys.exit(0)
# self.conf = self._default_config()
# try:
# self.conf = configparser.ConfigParser()
@@ -40,13 +89,24 @@ def __init__(self, path: str = "config.ini"):
# print("[-]",e)
# sys.exit(3)
# #self.conf = self._default_config()
+ def getboolean_override(self, section, item) -> bool:
+ return self.conf.getboolean(section, item) if G_conf_override[f"{section}:{item}"] is None else bool(G_conf_override[f"{section}:{item}"])
+
+ def getint_override(self, section, item) -> int:
+ return self.conf.getint(section, item) if G_conf_override[f"{section}:{item}"] is None else int(G_conf_override[f"{section}:{item}"])
+
+ def get_override(self, section, item) -> str:
+ return self.conf.get(section, item) if G_conf_override[f"{section}:{item}"] is None else str(G_conf_override[f"{section}:{item}"])
- def main_mode(self) -> str:
+ def main_mode(self) -> int:
try:
- return self.conf.getint("common", "main_mode")
+ return self.getint_override("common", "main_mode")
except ValueError:
self._exit("common:main_mode")
+ def source_folder(self) -> str:
+ return self.get_override("common", "source_folder")
+
def failed_folder(self) -> str:
return self.conf.get("common", "failed_output_folder")
@@ -61,7 +121,7 @@ def soft_link(self) -> bool:
def failed_move(self) -> bool:
return self.conf.getboolean("common", "failed_move")
def auto_exit(self) -> bool:
- return self.conf.getboolean("common", "auto_exit")
+ return self.getboolean_override("common", "auto_exit")
def transalte_to_sc(self) -> bool:
return self.conf.getboolean("common", "transalte_to_sc")
def multi_threading(self) -> bool:
@@ -70,14 +130,18 @@ def del_empty_folder(self) -> bool:
return self.conf.getboolean("common", "del_empty_folder")
def nfo_skip_days(self) -> int:
try:
- return self.conf.getint("common", "nfo_skip_days")
+ return self.getint_override("common", "nfo_skip_days")
except:
return 30
def stop_counter(self) -> int:
try:
- return self.conf.getint("common", "stop_counter")
+ return self.getint_override("common", "stop_counter")
except:
return 0
+ def ignore_failed_list(self) -> bool:
+ return self.getboolean_override("common", "ignore_failed_list")
+ def download_only_missing_images(self) -> bool:
+ return self.conf.getboolean("common", "download_only_missing_images")
def is_transalte(self) -> bool:
return self.conf.getboolean("transalte", "switch")
def is_trailer(self) -> bool:
@@ -173,7 +237,39 @@ def escape_folder(self) -> str:
return self.conf.get("escape", "folders")
def debug(self) -> bool:
- return self.conf.getboolean("debug_mode", "switch")
+ return self.getboolean_override("debug_mode", "switch")
+
+ def storyline_site(self) -> str:
+ try:
+ return self.conf.get("storyline", "site")
+ except:
+ return "avno1"
+
+ def storyline_censored_site(self) -> str:
+ try:
+ return self.conf.get("storyline", "censored_site")
+ except:
+ return "airav,xcity,amazon"
+
+ def storyline_uncensored_site(self) -> str:
+ try:
+ return self.conf.get("storyline", "uncensored_site")
+ except:
+ return "58avgo"
+
+ def storyline_show(self) -> int:
+ try:
+ v = self.conf.getint("storyline", "show_result")
+ return v if v in (0,1,2) else 2 if v > 2 else 0
+ except:
+ return 0
+
+ def storyline_mode(self) -> int:
+ try:
+ v = self.conf.getint("storyline", "run_mode")
+ return v if v in (0,1,2) else 2 if v > 2 else 0
+ except:
+ return 1
@staticmethod
def _exit(sec: str) -> None:
@@ -188,6 +284,7 @@ def _default_config() -> configparser.ConfigParser:
sec1 = "common"
conf.add_section(sec1)
conf.set(sec1, "main_mode", "1")
+ conf.set(sec1, "source_folder", "./")
conf.set(sec1, "failed_output_folder", "failed")
conf.set(sec1, "success_output_folder", "JAV_output")
conf.set(sec1, "soft_link", "0")
@@ -199,6 +296,8 @@ def _default_config() -> configparser.ConfigParser:
conf.set(sec1, "del_empty_folder", "1")
conf.set(sec1, "nfo_skip_days", 30)
conf.set(sec1, "stop_counter", 0)
+ conf.set(sec1, "ignore_failed_list", 0)
+ conf.set(sec1, "download_only_missing_images", 1)
sec2 = "proxy"
conf.add_section(sec2)
@@ -265,6 +364,14 @@ def _default_config() -> configparser.ConfigParser:
conf.set(sec13, "switch", 1)
conf.set(sec13, "extrafanart_folder", "extrafanart")
+ sec14 = "storyline"
+ conf.add_section(sec14)
+ conf.set(sec14, "site", "avno1")
+ conf.set(sec14, "censored_site", "airav,xcity,amazon")
+ conf.set(sec14, "uncensored_site", "58avgo")
+ conf.set(sec14, "show_result", 0)
+ conf.set(sec14, "run_mode", 1)
+
return conf
@@ -308,9 +415,45 @@ def evprint(evstr):
code = compile(evstr, "", "eval")
print('{}: "{}"'.format(evstr, eval(code)))
config = Config()
- mfilter = ('conf', 'proxy', '_exit', '_default_config')
+ mfilter = {'conf', 'proxy', '_exit', '_default_config', 'getboolean_override', 'getint_override', 'get_override', 'ini_path'}
for _m in [m for m in dir(config) if not m.startswith('__') and m not in mfilter]:
evprint(f'config.{_m}()')
- pfilter = ('proxies', 'SUPPORT_PROXY_TYPE')
- for _p in [p for p in dir(config.proxy()) if not p.startswith('__') and p not in pfilter]:
- evprint(f'config.proxy().{_p}')
+ pfilter = {'proxies', 'SUPPORT_PROXY_TYPE'}
+ # test getInstance()
+ assert(getInstance() == config)
+ for _p in [p for p in dir(getInstance().proxy()) if not p.startswith('__') and p not in pfilter]:
+ evprint(f'getInstance().proxy().{_p}')
+
+ # Override Test
+ G_conf_override["common:nfo_skip_days"] = 4321
+ G_conf_override["common:stop_counter"] = 1234
+ assert config.nfo_skip_days() == 4321
+ assert getInstance().stop_counter() == 1234
+ # remove override
+ G_conf_override["common:stop_counter"] = None
+ G_conf_override["common:nfo_skip_days"] = None
+ assert config.nfo_skip_days() != 4321
+ assert config.stop_counter() != 1234
+ # Create new instance
+ conf2 = Config()
+ assert getInstance() != conf2
+ assert getInstance() == config
+ G_conf_override["common:main_mode"] = 9
+ G_conf_override["common:source_folder"] = "A:/b/c"
+ # Override effect to all instances
+ assert config.main_mode() == 9
+ assert conf2.main_mode() == 9
+ assert getInstance().main_mode() == 9
+ assert conf2.source_folder() == "A:/b/c"
+ print("### Override Test ###".center(36))
+ evprint('getInstance().main_mode()')
+ evprint('config.source_folder()')
+ G_conf_override["common:main_mode"] = None
+ evprint('conf2.main_mode()')
+ evprint('config.main_mode()')
+    # accessing an unregistered key raises KeyError
+    try:
+        print(G_conf_override["common:actor_gender"])
+    except KeyError as ke:
+        print(f'Caught KeyError: {ke} is not a registered key of the G_conf_override dict.', file=sys.stderr)
+    print(f"Loaded config file '{conf2.ini_path}'.")
diff --git a/core.py b/core.py
index cb1a78238..24c1ce51c 100755
--- a/core.py
+++ b/core.py
@@ -3,8 +3,6 @@
import pathlib
import re
import shutil
-import platform
-import errno
import sys
from PIL import Image
@@ -14,7 +12,7 @@
from ADC_function import *
from WebCrawler import get_data_from_json
-
+from number_parser import is_uncensored
def escape_path(path, escape_literals: str): # Remove escape literals
backslash = '\\'
@@ -23,7 +21,8 @@ def escape_path(path, escape_literals: str): # Remove escape literals
return path
-def moveFailedFolder(filepath, conf):
+def moveFailedFolder(filepath):
+ conf = config.getInstance()
failed_folder = conf.failed_folder()
soft_link = conf.soft_link()
     # In mode 3 or soft-link mode, maintain a failed list instead: it is loaded when the scan starts to exclude those paths, so they are not processed again and again
@@ -33,7 +32,6 @@ def moveFailedFolder(filepath, conf):
print("[-]Add to Failed List file, see '%s'" % ftxt)
with open(ftxt, 'a', encoding='utf-8') as flt:
flt.write(f'{filepath}\n')
- flt.close()
elif conf.failed_move() and not soft_link:
failed_name = os.path.join(failed_folder, os.path.basename(filepath))
mtxt = os.path.abspath(os.path.join(failed_folder, 'where_was_i_before_being_moved.txt'))
@@ -41,8 +39,13 @@ def moveFailedFolder(filepath, conf):
with open(mtxt, 'a', encoding='utf-8') as wwibbmt:
tmstr = datetime.now().strftime("%Y-%m-%d %H:%M")
wwibbmt.write(f'{tmstr} FROM[{filepath}]TO[{failed_name}]\n')
- wwibbmt.close()
- shutil.move(filepath, failed_name)
+ try:
+ if os.path.exists(failed_name):
+ print('[-]File Exists while moving to FailedFolder')
+ return
+ shutil.move(filepath, failed_name)
+ except:
+ print('[-]File Moving to FailedFolder unsuccessful!')
 def get_info(json_data): # pull the fields out of the json
@@ -63,14 +66,15 @@ def get_info(json_data): # 返回json里的数据
return title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label
-def small_cover_check(path, number, cover_small, leak_word, c_word, conf: config.Config, filepath):
+def small_cover_check(path, number, cover_small, leak_word, c_word, filepath):
filename = f"{number}{leak_word}{c_word}-poster.jpg"
- download_file_with_filename(cover_small, filename, path, conf, filepath)
+ download_file_with_filename(cover_small, filename, path, filepath)
print('[+]Image Downloaded! ' + os.path.join(path, filename))
-def create_folder(json_data, conf: config.Config): # 创建文件夹
+def create_folder(json_data): # 创建文件夹
title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info(json_data)
+ conf = config.getInstance()
success_folder = conf.success_folder()
actor = json_data.get('actor')
location_rule = eval(conf.location_rule(), json_data)
@@ -81,35 +85,40 @@ def create_folder(json_data, conf: config.Config): # 创建文件夹
if 'title' in conf.location_rule() and len(title) > maxlen:
shorttitle = title[0:maxlen]
location_rule = location_rule.replace(title, shorttitle)
-
- path = os.path.join(success_folder, location_rule).strip()
- if not os.path.isdir(path):
+    # When actor is empty, location_rule evaluates to the absolute path '/number', making os.path.join drop the first argument; prepending ./ keeps the rule relative
+ path = os.path.join(success_folder, f'./{location_rule.strip()}')
+ if not os.path.exists(path):
path = escape_path(path, conf.escape_literals())
try:
os.makedirs(path)
- if not os.path.isdir(path):
- raise
except:
path = success_folder + '/' + location_rule.replace('/[' + number + ')-' + title, "/number")
path = escape_path(path, conf.escape_literals())
+ try:
+ os.makedirs(path)
+ except:
+ print(f"[-]Fatal error! Can not make folder '{path}'")
+ sys.exit(0)
- os.makedirs(path)
- return path
+ return os.path.normpath(path)
 # ===================== resource download section ===========================
 # path example: photo and video folders inside the project folder
-def download_file_with_filename(url, filename, path, conf: config.Config, filepath):
+def download_file_with_filename(url, filename, path, filepath):
+ conf = config.getInstance()
configProxy = conf.proxy()
for i in range(configProxy.retry):
try:
if configProxy.enable:
- if not os.path.isdir(path):
- os.makedirs(path)
- if not os.path.isdir(path):
- raise IOError
+ if not os.path.exists(path):
+ try:
+ os.makedirs(path)
+ except:
+ print(f"[-]Fatal error! Can not make folder '{path}'")
+ sys.exit(0)
proxies = configProxy.proxies()
headers = {
'User-Agent': G_USER_AGENT}
@@ -121,10 +130,12 @@ def download_file_with_filename(url, filename, path, conf: config.Config, filepa
code.write(r.content)
return
else:
- if not os.path.isdir(path):
- os.makedirs(path)
- if not os.path.isdir(path):
- raise IOError
+ if not os.path.exists(path):
+ try:
+ os.makedirs(path)
+ except:
+ print(f"[-]Fatal error! Can not make folder '{path}'")
+ sys.exit(0)
headers = {
'User-Agent': G_USER_AGENT}
r = requests.get(url, timeout=configProxy.timeout, headers=headers)
@@ -148,46 +159,50 @@ def download_file_with_filename(url, filename, path, conf: config.Config, filepa
print('[-]Image Download : Connect retry ' + str(i) + '/' + str(configProxy.retry))
except IOError:
print(f"[-]Create Directory '{path}' failed!")
- moveFailedFolder(filepath, conf)
+ moveFailedFolder(filepath)
return
print('[-]Connect Failed! Please check your Proxy or Network!')
- moveFailedFolder(filepath, conf)
+ moveFailedFolder(filepath)
return
-def trailer_download(trailer, leak_word, c_word, number, path, filepath, conf: config.Config):
- if download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, conf, filepath) == 'failed':
+def trailer_download(trailer, leak_word, c_word, number, path, filepath):
+ if download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, filepath) == 'failed':
return
- configProxy = conf.proxy()
+ configProxy = config.getInstance().proxy()
for i in range(configProxy.retry):
- if os.path.getsize(path+'/' + number + leak_word + c_word + '-trailer.mp4') == 0:
+ if file_not_exist_or_empty(path+'/' + number + leak_word + c_word + '-trailer.mp4'):
             print('[!]Video Download Failed! Trying again. [{}/3]'.format(i + 1))
- download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, conf, filepath)
+ download_file_with_filename(trailer, number + leak_word + c_word + '-trailer.mp4', path, filepath)
continue
else:
break
- if os.path.getsize(path + '/' + number + leak_word + c_word + '-trailer.mp4') == 0:
+ if file_not_exist_or_empty(path + '/' + number + leak_word + c_word + '-trailer.mp4'):
return
print('[+]Video Downloaded!', path + '/' + number + leak_word + c_word + '-trailer.mp4')
 # the stills must download successfully, otherwise move the video to failed
-def extrafanart_download(data, path, conf: config.Config, filepath):
+def extrafanart_download(data, path, filepath):
j = 1
+ conf = config.getInstance()
path = os.path.join(path, conf.get_extrafanart())
+ configProxy = conf.proxy()
+ download_only_missing_images = conf.download_only_missing_images()
for url in data:
jpg_filename = f'extrafanart-{j}.jpg'
jpg_fullpath = os.path.join(path, jpg_filename)
- if download_file_with_filename(url, jpg_filename, path, conf, filepath) == 'failed':
- moveFailedFolder(filepath, conf)
+ if download_only_missing_images and not file_not_exist_or_empty(jpg_fullpath):
+ continue
+ if download_file_with_filename(url, jpg_filename, path, filepath) == 'failed':
+ moveFailedFolder(filepath)
return
- configProxy = conf.proxy()
for i in range(configProxy.retry):
- if os.path.getsize(jpg_fullpath) == 0:
+ if file_not_exist_or_empty(jpg_fullpath):
             print('[!]Image Download Failed! Trying again. [{}/3]'.format(i + 1))
- download_file_with_filename(url, jpg_filename, path, conf, filepath)
+ download_file_with_filename(url, jpg_filename, path, filepath)
continue
else:
break
- if os.path.getsize(jpg_fullpath) == 0:
+ if file_not_exist_or_empty(jpg_fullpath):
return
print('[+]Image Downloaded!', jpg_fullpath)
j += 1
@@ -195,39 +210,46 @@ def extrafanart_download(data, path, conf: config.Config, filepath):
 # the cover must download successfully, otherwise move the video to failed
-def image_download(cover, number, leak_word, c_word, path, conf: config.Config, filepath):
+def image_download(cover, number, leak_word, c_word, path, filepath):
filename = f"{number}{leak_word}{c_word}-fanart.jpg"
full_filepath = os.path.join(path, filename)
- if download_file_with_filename(cover, filename, path, conf, filepath) == 'failed':
- moveFailedFolder(filepath, conf)
+ if config.getInstance().download_only_missing_images() and not file_not_exist_or_empty(full_filepath):
+ return
+ if download_file_with_filename(cover, filename, path, filepath) == 'failed':
+ moveFailedFolder(filepath)
return
- configProxy = conf.proxy()
+ configProxy = config.getInstance().proxy()
for i in range(configProxy.retry):
- if os.path.getsize(full_filepath) == 0:
+ if file_not_exist_or_empty(full_filepath):
             print('[!]Image Download Failed! Trying again. [{}/3]'.format(i + 1))
- download_file_with_filename(cover, filename, path, conf, filepath)
+ download_file_with_filename(cover, filename, path, filepath)
continue
else:
break
- if os.path.getsize(full_filepath) == 0:
+ if file_not_exist_or_empty(full_filepath):
return
print('[+]Image Downloaded!', full_filepath)
shutil.copyfile(full_filepath, os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg"))
-def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, uncensored, conf):
+def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, filepath, tag, actor_list, liuchu, uncensored):
title, studio, year, outline, runtime, director, actor_photo, release, number, cover, trailer, website, series, label = get_info(json_data)
- failed_folder = conf.failed_folder()
-    if conf.main_mode() == 3: # In mode 3 the video file is untouched, so the .nfo must match the video filename (apart from the extension) for KODI and similar software to find it
+    if config.getInstance().main_mode() == 3: # In mode 3 the video file is untouched, so the .nfo must match the video filename (apart from the extension) for KODI and similar software to find it
nfo_path = str(Path(filepath).with_suffix('.nfo'))
else:
nfo_path = os.path.join(path,f"{number}{part}{leak_word}{c_word}.nfo")
try:
- if not os.path.isdir(path):
- os.makedirs(path)
- if not os.path.isdir(path):
- raise IOError
+ if not os.path.exists(path):
+ try:
+ os.makedirs(path)
+ except:
+ print(f"[-]Fatal error! can not make folder '{path}'")
+ sys.exit(0)
+
+            # KODI cannot find the number when browsing film info; naming_rule=number+'#'+title would solve that
+            # but makes the title too long, so the usually-empty outline is a better home for it, and players give outline a larger display area anyway
+ outline = f"{number}#{outline}"
with open(nfo_path, "wt", encoding='UTF-8') as code:
             print('<?xml version="1.0" encoding="UTF-8" ?>', file=code)
             print("<movie>", file=code)
@@ -279,7 +301,7 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
             print("    <num>" + number + "</num>", file=code)
             print("    <release>" + release + "</release>", file=code)
             print("    <cover>" + cover + "</cover>", file=code)
-        if config.Config().is_trailer():
+        if config.getInstance().is_trailer():
             print("    <trailer>" + trailer + "</trailer>", file=code)
             print("    <website>" + website + "</website>", file=code)
             print("</movie>", file=code)
@@ -287,12 +309,12 @@ def print_files(path, leak_word, c_word, naming_rule, part, cn_sub, json_data, f
except IOError as e:
print("[-]Write Failed!")
print("[-]", e)
- moveFailedFolder(filepath, conf)
+ moveFailedFolder(filepath)
return
except Exception as e1:
print("[-]Write Failed!")
print("[-]", e1)
- moveFailedFolder(filepath, conf)
+ moveFailedFolder(filepath)
return
@@ -321,7 +343,7 @@ def cutImage(imagecut, path, number, leak_word, c_word):
 # leak (leaked): value 1 or 0
 # uncensored: value 1 or 0
 # ======================================================================== add watermark
-def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf:config.Config):
+def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored):
mark_type = ''
if cn_sub:
mark_type += ',字幕'
@@ -331,17 +353,17 @@ def add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf:config.Conf
mark_type += ',无码'
if mark_type == '':
return
- add_mark_thread(thumb_path, cn_sub, leak, uncensored, conf)
+ add_mark_thread(thumb_path, cn_sub, leak, uncensored)
print('[+]Thumb Add Mark: ' + mark_type.strip(','))
- add_mark_thread(poster_path, cn_sub, leak, uncensored, conf)
+ add_mark_thread(poster_path, cn_sub, leak, uncensored)
print('[+]Poster Add Mark: ' + mark_type.strip(','))
-def add_mark_thread(pic_path, cn_sub, leak, uncensored, conf):
+def add_mark_thread(pic_path, cn_sub, leak, uncensored):
size = 14
img_pic = Image.open(pic_path)
     # read the configured corner; the modulo below combined with pos walks the marks clockwise
     # top-left 0, top-right 1, bottom-right 2, bottom-left 3
- count = conf.watermark_type()
+ count = config.getInstance().watermark_type()
if cn_sub == 1 or cn_sub == '1':
         add_to_pic(pic_path, img_pic, size, count, 1) # add the Chinese-subtitle mark
count = (count + 1) % 4
@@ -391,29 +413,38 @@ def add_to_pic(pic_path, img_pic, size, count, mode):
img_pic.save(pic_path, quality=95)
 # ======================== end =================================
-def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config.Config): # file path, number, suffix, destination folder
+def paste_file_to_folder(filepath, path, number, leak_word, c_word): # file path, number, suffix, destination folder
filepath_obj = pathlib.Path(filepath)
houzhui = filepath_obj.suffix
file_parent_origin_path = str(filepath_obj.parent)
try:
targetpath = os.path.join(path, f"{number}{leak_word}{c_word}{houzhui}")
+    # Never overwrite under any circumstances, to avoid the worst case where a data-source or engine error
+    # gives every file the same number and one-by-one same-name overwrites destroy all of them beyond recovery
+ if os.path.exists(targetpath):
+ raise FileExistsError('File Exists on destination path, we will never overwriting.')
+ soft_link = config.getInstance().soft_link()
     # if soft_link=1, use a symlink
- if conf.soft_link() == 0:
+ if soft_link == 0:
shutil.move(filepath, targetpath)
- elif conf.soft_link() == 1:
-        # use a relative path so the video opens correctly when accessed over the network
- filerelpath = os.path.relpath(filepath, path)
- os.symlink(filerelpath, targetpath)
- elif conf.soft_link() == 2:
+ elif soft_link == 1:
+        # First try a relative path so the video opens correctly when accessed over the network; failure usually
+        # means relative links are unsupported (crossing drive letters etc.), so retry the symlink with an absolute path
+ try:
+ filerelpath = os.path.relpath(filepath, path)
+ os.symlink(filerelpath, targetpath)
+ except:
+ os.symlink(filepath_obj.resolve(), targetpath)
+ elif soft_link == 2:
shutil.move(filepath, targetpath)
         # After moving the file, leave a traceable symlink at the original location pointing to its new home,
         # so it is possible to trace where a file went when a wrong number caused a rename-and-move, avoiding lost files
-        # and easing manual recovery. Also change the symlink's extension so it is not scraped again.
+        # and easing manual recovery. Since symlinks are no longer scraped, the extension no longer needs changing.
targetabspath = os.path.abspath(targetpath)
if targetabspath != os.path.abspath(filepath):
targetrelpath = os.path.relpath(targetabspath, file_parent_origin_path)
- os.symlink(targetrelpath, filepath + '#sym')
- sub_res = conf.sub_rule()
+ os.symlink(targetrelpath, filepath)
+ sub_res = config.getInstance().sub_rule()
for subname in sub_res:
sub_filepath = str(filepath_obj.with_suffix(subname))
@@ -422,9 +453,9 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config
print('[+]Sub moved!')
return True
- except FileExistsError:
- print('[-]File Exists! Please check your movie!')
- print('[-]move to the root folder of the program.')
+ except FileExistsError as fee:
+ print(f'[-]FileExistsError: {fee}')
+ moveFailedFolder(filepath)
return
except PermissionError:
print('[-]Error! Please run as administrator!')
@@ -434,19 +465,22 @@ def paste_file_to_folder(filepath, path, number, leak_word, c_word, conf: config
return
-def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, conf): # file path, number, suffix, destination folder
+def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word): # file path, number, suffix, destination folder
if multi_part == 1:
         number += part # number gets the CD1 suffix appended here
filepath_obj = pathlib.Path(filepath)
houzhui = filepath_obj.suffix
file_parent_origin_path = str(filepath_obj.parent)
+ targetpath = os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}")
+ if os.path.exists(targetpath):
+ raise FileExistsError('File Exists on destination path, we will never overwriting.')
try:
- if conf.soft_link():
- os.symlink(filepath, os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}"))
+ if config.getInstance().soft_link():
+ os.symlink(filepath, targetpath)
else:
- shutil.move(filepath, os.path.join(path, f"{number}{part}{leak_word}{c_word}{houzhui}"))
+ shutil.move(filepath, targetpath)
- sub_res = conf.sub_rule()
+ sub_res = config.getInstance().sub_rule()
for subname in sub_res:
sub_filepath = str(filepath_obj.with_suffix(subname))
             if os.path.isfile(sub_filepath): # move the subtitles too
@@ -454,9 +488,8 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo
print('[+]Sub moved!')
print('[!]Success')
return True
- except FileExistsError:
- print('[-]File Exists! Please check your movie!')
- print('[-]move to the root folder of the program.')
+ except FileExistsError as fee:
+ print(f'[-]FileExistsError: {fee}')
return
except PermissionError:
print('[-]Error! Please run as administrator!')
@@ -465,7 +498,7 @@ def paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_wo
print(f'[-]OS Error errno {oserr.errno}')
return
-def get_part(filepath, conf):
+def get_part(filepath):
try:
if re.search('-CD\d+', filepath):
return re.findall('-CD\d+', filepath)[0]
@@ -473,7 +506,7 @@ def get_part(filepath, conf):
return re.findall('-cd\d+', filepath)[0]
except:
print("[-]failed!Please rename the filename again!")
- moveFailedFolder(filepath, conf)
+ moveFailedFolder(filepath)
return
@@ -493,7 +526,8 @@ def debug_print(data: json):
pass
-def core_main(file_path, number_th, conf: config.Config):
+def core_main(file_path, number_th):
+ conf = config.getInstance()
# ======================================================================= Initialize required variables
multi_part = 0
part = ''
@@ -507,11 +541,11 @@ def core_main(file_path, number_th, conf: config.Config):
# the commented-out variable below is not needed
#rootpath= os.getcwd
number = number_th
- json_data = get_data_from_json(number, conf) # look up metadata for the number
+ json_data = get_data_from_json(number) # look up metadata for the number
# Return if blank dict returned (data not found)
if not json_data:
- moveFailedFolder(filepath, conf)
+ moveFailedFolder(filepath)
return
if json_data["number"] != number:
@@ -526,16 +560,13 @@ def core_main(file_path, number_th, conf: config.Config):
# ======================================================================= Check for -C / -CD suffixes
if '-CD' in filepath or '-cd' in filepath:
multi_part = 1
- part = get_part(filepath, conf)
+ part = get_part(filepath)
if '-c.' in filepath or '-C.' in filepath or '中文' in filepath or '字幕' in filepath:
cn_sub = '1'
c_word = '-C' # suffix for movies with Chinese subtitles
# Determine whether the movie is uncensored
- if is_uncensored(number):
- uncensored = 1
- else:
- uncensored = 0
+ uncensored = 1 if is_uncensored(number) else 0
if '流出' in filepath or 'uncensored' in filepath:
@@ -550,7 +581,7 @@ def core_main(file_path, number_th, conf: config.Config):
debug_print(json_data)
# create the folder
- #path = create_folder(rootpath + '/' + conf.success_folder(), json_data.get('location_rule'), json_data, conf)
+ #path = create_folder(rootpath + '/' + conf.success_folder(), json_data.get('location_rule'), json_data)
# main_mode
# 1: Scraping mode
@@ -558,54 +589,55 @@ def core_main(file_path, number_th, conf: config.Config):
# 3: Scrape in place without moving files
if conf.main_mode() == 1:
# create the folder
- path = create_folder(json_data, conf)
+ path = create_folder(json_data)
if multi_part == 1:
number += part # number now carries the CD1-style part suffix
# Check the small cover; if imagecut is 3, download the small cover
if imagecut == 3:
- small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, conf, filepath)
+ small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, filepath)
# creatFolder returns the path derived from the number
- image_download( json_data.get('cover'), number, leak_word, c_word, path, conf, filepath)
+ image_download( json_data.get('cover'), number, leak_word, c_word, path, filepath)
if not multi_part or part.lower() == '-cd1':
try:
# download the trailer
if conf.is_trailer() and json_data.get('trailer'):
- trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath, conf)
+ trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath)
except:
pass
try:
- # download extra fanart stills: data, path, conf: config.Config, filepath
+ # download extra fanart stills: data, path, filepath
if conf.is_extrafanart() and json_data.get('extrafanart'):
- extrafanart_download(json_data.get('extrafanart'), path, conf, filepath)
+ extrafanart_download(json_data.get('extrafanart'), path, filepath)
except:
pass
# crop the image
cutImage(imagecut, path, number, leak_word, c_word)
- # write the metadata files
- print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored, conf)
-
- # move the file
- paste_file_to_folder(filepath, path, number, leak_word, c_word, conf)
-
+ # add watermarks
poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg")
thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg")
if conf.is_watermark():
- add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf)
+ add_mark(poster_path, thumb_path, cn_sub, leak, uncensored)
+
+ # move the movie
+ paste_file_to_folder(filepath, path, number, leak_word, c_word)
+
+ # Write the .nfo metadata file last, so that its successful creation marks the task as complete
+ print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath, tag, json_data.get('actor_list'), liuchu, uncensored)
elif conf.main_mode() == 2:
# create the folder
- path = create_folder(json_data, conf)
+ path = create_folder(json_data)
# move the file
- paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word, conf)
+ paste_file_to_folder_mode2(filepath, path, multi_part, number, part, leak_word, c_word)
poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg")
thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg")
if conf.is_watermark():
- add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf)
+ add_mark(poster_path, thumb_path, cn_sub, leak, uncensored)
elif conf.main_mode() == 3:
path = str(Path(file_path).parent)
@@ -614,28 +646,29 @@ def core_main(file_path, number_th, conf: config.Config):
# Check the small cover; if imagecut is 3, download the small cover
if imagecut == 3:
- small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, conf, filepath)
+ small_cover_check(path, number, json_data.get('cover_small'), leak_word, c_word, filepath)
# creatFolder returns the path derived from the number
- image_download(json_data.get('cover'), number, leak_word, c_word, path, conf, filepath)
+ image_download(json_data.get('cover'), number, leak_word, c_word, path, filepath)
if not multi_part or part.lower() == '-cd1':
# download the trailer
if conf.is_trailer() and json_data.get('trailer'):
- trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath, conf)
+ trailer_download(json_data.get('trailer'), leak_word, c_word, number, path, filepath)
- # download extra fanart stills: data, path, conf: config.Config, filepath
+ # download extra fanart stills: data, path, filepath
if conf.is_extrafanart() and json_data.get('extrafanart'):
- extrafanart_download(json_data.get('extrafanart'), path, conf, filepath)
+ extrafanart_download(json_data.get('extrafanart'), path, filepath)
# crop the image
cutImage(imagecut, path, number, leak_word, c_word)
- # write the metadata files
- print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath,
- tag, json_data.get('actor_list'), liuchu, uncensored, conf)
-
+ # add watermarks
poster_path = os.path.join(path, f"{number}{leak_word}{c_word}-poster.jpg")
thumb_path = os.path.join(path, f"{number}{leak_word}{c_word}-thumb.jpg")
if conf.is_watermark():
- add_mark(poster_path, thumb_path, cn_sub, leak, uncensored, conf)
+ add_mark(poster_path, thumb_path, cn_sub, leak, uncensored)
+
+ # Write the .nfo metadata file last, so that its successful creation marks the task as complete
+ print_files(path, leak_word, c_word, json_data.get('naming_rule'), part, cn_sub, json_data, filepath,
+ tag, json_data.get('actor_list'), liuchu, uncensored)
diff --git a/number_parser.py b/number_parser.py
index 2d1874e1b..4d4fe937a 100755
--- a/number_parser.py
+++ b/number_parser.py
@@ -1,14 +1,14 @@
import os
import re
-from core import *
-
+import sys
+import config
G_spat = re.compile(
"^22-sht\.me|-fhd|_fhd|^fhd_|^fhd-|-hd|_hd|^hd_|^hd-|-sd|_sd|-1080p|_1080p|-720p|_720p|^hhd800\.com@",
re.IGNORECASE)
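# For illustration, G_spat strips release-site and quality noise before parsing, e.g.
#   G_spat.sub("", "hhd800.com@SSIS-123-FHD.mp4")  ->  "SSIS-123.mp4"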
-def get_number(debug,filepath: str) -> str:
+def get_number(debug, file_path: str) -> str:
# """
# >>> from number_parser import get_number
# >>> get_number("/Users/Guest/AV_Data_Capture/snis-829.mp4")
@@ -32,77 +32,174 @@ def get_number(debug,filepath: str) -> str:
# >>> get_number("snis-829-C.mp4")
# 'snis-829'
# """
- filepath = os.path.basename(filepath)
-
- if debug == False:
- try:
- if '-' in filepath or '_' in filepath: # regular number extraction, mainly for numbers containing '-' or '_'
- #filepath = filepath.replace("_", "-")
- filepath = G_spat.sub("", filepath)
- filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath)) # strip the date from the filename
- lower_check = filename.lower()
- if 'fc2' in lower_check:
- filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper()
- file_number = get_number_by_dict(lower_check)
- if file_number:
- return file_number
- return str(re.search(r'\w+(-|_)\w+', filename, re.A).group())
- else: # extract numbers without a dash: FANZA CID
- # matching rule for Western (EU/US) numbers
- oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath)
- if oumei:
- return oumei.group()
-
- try:
- return str(
- re.findall(r'(.+?)\.',
- str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip(
- "['']").replace('_', '-')
- except:
- return re.search(r'(.+?)\.', filepath)[0]
- except Exception as e:
- print('[-]' + str(e))
- return
- elif debug == True:
- if '-' in filepath or '_' in filepath: # regular number extraction, mainly for numbers containing '-' or '_'
- #filepath = filepath.replace("_", "-")
+ filepath = os.path.basename(file_path)
+ # The debug True and False blocks are merged: this module only does string computation with no I/O, so when debug is on it is enough to print the exception information
+ try:
+ file_number = get_number_by_dict(filepath)
+ if file_number:
+ return file_number
+ elif '-' in filepath or '_' in filepath: # regular number extraction, mainly for numbers containing '-' or '_'
filepath = G_spat.sub("", filepath)
filename = str(re.sub("\[\d{4}-\d{1,2}-\d{1,2}\] - ", "", filepath)) # strip the date from the filename
lower_check = filename.lower()
if 'fc2' in lower_check:
filename = lower_check.replace('ppv', '').replace('--', '-').replace('_', '-').upper()
- file_number = get_number_by_dict(lower_check)
- if file_number:
- return file_number
return str(re.search(r'\w+(-|_)\w+', filename, re.A).group())
else: # extract numbers without a dash: FANZA CID
# matching rule for Western (EU/US) numbers
oumei = re.search(r'[a-zA-Z]+\.\d{2}\.\d{2}\.\d{2}', filepath)
if oumei:
return oumei.group()
-
try:
return str(
re.findall(r'(.+?)\.',
- str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip(
+ str(re.search('([^<>/\\\\|:""\\*\\?]+)\\.\\w+$', filepath).group()))).strip(
"['']").replace('_', '-')
except:
- return re.search(r'(.+?)\.', filepath)[0]
+ return str(re.search(r'(.+?)\.', filepath)[0])
+ except Exception as e:
+ if debug:
+ print(f'[-]Number Parser exception: {e} [{file_path}]')
+ return None
+
+# Extract the number according to the javdb data source's naming conventions
G_TAKE_NUM_RULES = {
- 'tokyo' : lambda x:str(re.search(r'(cz|gedo|k|n|red-|se)\d{2,4}', x, re.A).group()),
- 'carib' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.A).group()).replace('_', '-'),
- '1pon' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.A).group()).replace('-', '_'),
- '10mu' : lambda x:str(re.search(r'\d{6}(-|_)\d{2}', x, re.A).group()).replace('-', '_'),
- 'x-art' : lambda x:str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.A).group())
- }
+ 'tokyo.*hot' : lambda x:str(re.search(r'(cz|gedo|k|n|red-|se)\d{2,4}', x, re.I).group()),
+ 'carib' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('_', '-'),
+ '1pon|mura|paco' : lambda x:str(re.search(r'\d{6}(-|_)\d{3}', x, re.I).group()).replace('-', '_'),
+ '10mu' : lambda x:str(re.search(r'\d{6}(-|_)\d{2}', x, re.I).group()).replace('-', '_'),
+ 'x-art' : lambda x:str(re.search(r'x-art\.\d{2}\.\d{2}\.\d{2}', x, re.I).group()),
+ 'xxx-av': lambda x:''.join(['xxx-av-', re.findall(r'xxx-av[^\d]*(\d{3,5})[^\d]*', x, re.I)[0]]),
+ 'heydouga': lambda x:'heydouga-' + '-'.join(re.findall(r'(\d{4})[\-_](\d{3,4})[^\d]*', x, re.I)[0])
+}
-def get_number_by_dict(lower_filename: str) -> str:
- for k,v in G_TAKE_NUM_RULES.items():
- if k in lower_filename:
- return v(lower_filename)
+def get_number_by_dict(filename: str) -> str:
+ try:
+ for k,v in G_TAKE_NUM_RULES.items():
+ if re.search(k, filename, re.I):
+ return v(filename)
+ except:
+ pass
return None
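+# For illustration, what the rules above yield:
+#   get_number_by_dict("heydouga-4102-023-CD2.iso") -> 'heydouga-4102-023'
+#   get_number_by_dict("caribean-020317_001.nfo")   -> '020317-001'
+#   get_number_by_dict("snis-829.mp4")              -> None (no studio rule matches)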
-# if __name__ == "__main__":
+class Cache_uncensored_conf:
+ prefix = None
+ def is_empty(self):
+ return self.prefix is None
+ def set(self, v: list):
+ if not v or not len(v) or not len(v[0]):
+ raise ValueError('input prefix list empty or None')
+ s = v[0]
+ if len(v) > 1:
+ for i in v[1:]:
+ s += f"|{i}.+"
+ self.prefix = re.compile(s, re.I)
+ def check(self, number):
+ if self.prefix is None:
+ raise ValueError('uncensored prefix regex not initialized')
+ return self.prefix.match(number)
+
+G_cache_uncensored_conf = Cache_uncensored_conf()
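+# Usage sketch (the prefix list here is hypothetical; the real one comes from
+# config.ini's uncensored setting via is_uncensored() below):
+#   c = Cache_uncensored_conf()
+#   c.set(['S2M', 'BT', 'LAF'])   # compiles r'S2M|BT.+|LAF.+'
+#   bool(c.check('LAF-42'))       # True
+#   bool(c.check('SSIS-123'))     # False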
+
+# ======================================================================== Uncensored check
+def is_uncensored(number):
+ if re.match(
+ r'[\d-]{4,}|\d{6}_\d{2,3}|(cz|gedo|k|n|red-|se)\d{2,4}|heyzo.+|xxx-av-.+|heydouga-.+|x-art\.\d{2}\.\d{2}\.\d{2}',
+ number,
+ re.I
+ ):
+ return True
+ if G_cache_uncensored_conf.is_empty():
+ G_cache_uncensored_conf.set(config.getInstance().get_uncensored().split(','))
+ return G_cache_uncensored_conf.check(number)
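+# e.g. '010115_001' already matches the built-in [\d-]{4,} branch and returns True
+# without touching the config; 'ABP-123' matches no built-in branch, so the
+# configured uncensored prefixes decide.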
+
+if __name__ == "__main__":
# import doctest
# doctest.testmod(raise_on_error=True)
+ test_use_cases = (
+ "Tokyo Hot n9001 FHD.mp4", # no dash; previously could not be extracted correctly
+ "TokyoHot-n1287-HD SP2006 .mp4",
+ "caribean-020317_001.nfo", # dash mistyped as an underscore
+ "257138_3xplanet_1Pondo_080521_001.mp4",
+ "ADV-R0624-CD3.wmv", # multi-disc movie
+ "XXX-AV 22061-CD5.iso", # newly supported studio format xxx-av-22061; naming rule from the javdb data source
+ "xxx-av 20589.mp4",
+ "Muramura-102114_145-HD.wmv", # newly supported studio format 102114_145; naming rule from the javdb data source
+ "heydouga-4102-023-CD2.iso", # newly supported studio format heydouga-4102-023; naming rule from the javdb data source
+ "HeyDOuGa4236-1048 Ai Qiu - .mp4", # heydouga-4236-1048; naming rule from the javdb data source
+ "pacopacomama-093021_539-FHD.mkv" # newly supported studio format 093021_539; naming rule from the javdb data source
+ )
+ def evprint(evstr):
+ code = compile(evstr, "", "eval")
+ print("{1:>20} # '{0}'".format(evstr[18:-2], eval(code)))
+ for t in test_use_cases:
+ evprint(f'get_number(True, "{t}")')
+
+ if len(sys.argv)<=1 or not re.search('^[A-Z]:?', sys.argv[1], re.IGNORECASE):
+ sys.exit(0)
+
+ # Use Everything's ES command-line tool to collect video filenames from whole disks as number-parser test data; the argument is a drive letter A..Z or a path with a drive letter
+ # https://www.voidtools.com/support/everything/command_line_interface/
+ # The ES command-line tool requires the Everything search engine to be running, and the single es.exe executable must be in a directory on PATH.
+ # Everything is freeware
+ # Examples:
+ # python.exe .\number_parser.py ALL # search all disks for videos
+ # python.exe .\number_parser.py D # search drive D
+ # python.exe .\number_parser.py D: # same as above
+ # python.exe .\number_parser.py D:\download\JAVs # search \download\JAVs on drive D; the path must include the drive letter
+ # ==================
+ # On Linux/WSL1|2, use mlocate (Ubuntu/Debian) or plocate (Debian sid) to collect video filenames from the whole disk as number test cases
+ # Install one with 'sudo apt install mlocate' or plocate, and run 'sudo updatedb' once to build the full-disk index
+ # On Mac OS X, use glocate from findutils: install it with 'brew install findutils' and run 'sudo gupdatedb' once to build the full-disk index
+ # Example:
+ # python3 ./number_parser.py ALL
+ import subprocess
+ ES_search_path = "ALL disks"
+ if sys.argv[1] == "ALL":
+ if sys.platform == "win32":
+ # ES_prog_path = 'C:/greensoft/es/es.exe'
+ ES_prog_path = 'es.exe' # es.exe must be in a directory on PATH
+ ES_cmdline = f'{ES_prog_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;flv;ts;webm;iso;mpg;m4v'
+ out_bytes = subprocess.check_output(ES_cmdline.split(' '))
+ out_text = out_bytes.decode('gb18030') # Chinese Windows 10 x64 outputs GB18030 by default; it maps one-to-one onto Unicode, so decoding is lossless
+ out_list = out_text.splitlines()
+ elif sys.platform in ("linux", "darwin"):
+ ES_prog_path = 'locate' if sys.platform == 'linux' else 'glocate'
+ ES_cmdline = r"{} -b -i --regex '\.mp4$|\.avi$|\.rmvb$|\.wmv$|\.mov$|\.mkv$|\.webm$|\.iso$|\.mpg$|\.m4v$'".format(ES_prog_path)
+ out_bytes = subprocess.check_output(ES_cmdline.split(' '))
+ out_text = out_bytes.decode('utf-8')
+ out_list = [ os.path.basename(line) for line in out_text.splitlines()]
+ else:
+ print('[-]Unsupported platform! Please run on Windows, Linux or macOS. Exiting.')
+ sys.exit(1)
+ else: # Windows single disk
+ if sys.platform != "win32":
+ print('[!]Usage: python3 ./number_parser.py ALL')
+ sys.exit(0)
+ # ES_prog_path = 'C:/greensoft/es/es.exe'
+ ES_prog_path = 'es.exe' # es.exe must be in a directory on PATH
+ if os.path.isdir(sys.argv[1]):
+ ES_search_path = sys.argv[1]
+ else:
+ ES_search_path = sys.argv[1][0] + ':/'
+ if not os.path.isdir(ES_search_path):
+ ES_search_path = 'C:/'
+ ES_search_path = os.path.normcase(ES_search_path)
+ ES_cmdline = f'{ES_prog_path} -path {ES_search_path} -name size:gigantic ext:mp4;avi;rmvb;wmv;mov;mkv;webm;iso;mpg;m4v'
+ out_bytes = subprocess.check_output(ES_cmdline.split(' '))
+ out_text = out_bytes.decode('gb18030') # Chinese Windows 10 x64 outputs GB18030 by default; it maps one-to-one onto Unicode, so decoding is lossless
+ out_list = out_text.splitlines()
+ print(f'\n[!]{ES_prog_path} is searching {ES_search_path} for movies as number parser test cases...')
+ print(f'[+]Found {len(out_list)} movies.')
+ for filename in out_list:
+ try:
+ n = get_number(True, filename)
+ if n:
+ print(' [{0}] {2}# {1}'.format(n, filename, '#uncensored' if is_uncensored(n) else ''))
+ else:
+ print(f'[-]Number parser returned None. # {filename}')
+ except Exception as e:
+ print(f'[-]Number Parser exception: {e} [{filename}]')
+
+ sys.exit(0)
diff --git a/py_to_exe.ps1 b/py_to_exe.ps1
index 7fc0f803f..77f169aa6 100644
--- a/py_to_exe.ps1
+++ b/py_to_exe.ps1
@@ -3,14 +3,15 @@
$CLOUDSCRAPER_PATH=$(python -c 'import cloudscraper as _; print(_.__path__[0])' | select -Last 1)
-mkdir build
+mkdir build
mkdir __pycache__
pyinstaller --onefile AV_Data_Capture.py `
--hidden-import ADC_function.py `
--hidden-import core.py `
--add-data "$CLOUDSCRAPER_PATH;cloudscraper" `
- --add-data "Img;Img"
+ --add-data "Img;Img" `
+ --add-data "config.ini;." `
rmdir -Recurse -Force build
rmdir -Recurse -Force __pycache__
diff --git a/wrapper/FreeBSD.sh b/wrapper/FreeBSD.sh
index 70f27d7c9..9717ef469 100755
--- a/wrapper/FreeBSD.sh
+++ b/wrapper/FreeBSD.sh
@@ -1,4 +1,8 @@
pkg install python38 py38-requests py38-pip py38-lxml py38-pillow py38-cloudscraper py38-pysocks git zip py38-beautifulsoup448
pip install pyquery pyinstaller
-pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "$(python3.8 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" --add-data "Img:Img"
+pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
+ --add-data "$(python3.8 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
+ --add-data "Img:Img" \
+ --add-data "config.ini:." \
+
cp config.ini ./dist
diff --git a/wrapper/Linux.sh b/wrapper/Linux.sh
index 1d05e6a65..63e3b1c19 100755
--- a/wrapper/Linux.sh
+++ b/wrapper/Linux.sh
@@ -12,5 +12,9 @@
#fi
pip3 install -r requirements.txt
pip3 install cloudscraper==1.2.52
-pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py --add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" --add-data "Img:Img"
+pyinstaller --onefile AV_Data_Capture.py --hidden-import ADC_function.py --hidden-import core.py \
+ --add-data "$(python3 -c 'import cloudscraper as _; print(_.__path__[0])' | tail -n 1):cloudscraper" \
+ --add-data "Img:Img" \
+ --add-data "config.ini:." \
+
cp config.ini ./dist