From 701cc954cb1e63b0631dad96d0b49fa9c9c364a1 Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 14 Nov 2021 08:52:45 +0800 Subject: [PATCH 1/3] bugfix --- AV_Data_Capture.py | 22 ++++++++++------------ WebCrawler/avsox.py | 4 ++-- WebCrawler/fc2.py | 2 +- WebCrawler/javbus.py | 2 +- WebCrawler/javlib.py | 4 ++-- 5 files changed, 16 insertions(+), 18 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index c15bb485e..231d6832e 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -474,18 +474,16 @@ def main(): check_update(version) # Download Mapping Table, parallel version - down_map_tab = [] - actor_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_actor.xml' - if not actor_xml.exists(): - down_map_tab.append(( - "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_actor.xml", - actor_xml)) - info_xml = Path.home() / '.local' / 'share' / 'avdc' / 'mapping_info.xml' - if not info_xml.exists(): - down_map_tab.append(( - "https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_info.xml", - info_xml)) - res = parallel_download_files(down_map_tab) + user_data_home = Path.home() / '.local' / 'share' / 'avdc' + map_tab = ( + ('https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_actor.xml', + user_data_home / 'mapping_actor.xml'), + ('https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_info.xml', + user_data_home / 'mapping_info.xml'), + ('https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/c_number.json', + user_data_home / 'c_number.json') + ) + res = parallel_download_files(((k, v) for k, v in map_tab if not v.exists())) for i, fp in enumerate(res, start=1): if fp and len(fp): print(f"[+] [{i}/{len(res)}] Mapping Table Downloaded to {fp}") diff --git a/WebCrawler/avsox.py b/WebCrawler/avsox.py index 8b73b83cd..3699aa45a 100644 --- a/WebCrawler/avsox.py +++ b/WebCrawler/avsox.py @@ -57,8 +57,8 @@ def getCover_small(html): result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']") return result def getTag(html): - result = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') - return result + x = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') + return [i.strip() for i in x[2:]] if len(x) > 2 else [] def getSeries(html): try: result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']") diff --git a/WebCrawler/fc2.py b/WebCrawler/fc2.py index 27bc1a075..c559c8db4 100644 --- a/WebCrawler/fc2.py +++ b/WebCrawler/fc2.py @@ -14,7 +14,7 @@ def getTitle_fc2com(htmlcode): #获取厂商 return result def getActor_fc2com(htmlcode): try: - htmtml = etree.fromstring(htmlcode, etree.HTMLParser()) + html = etree.fromstring(htmlcode, etree.HTMLParser()) result = html.xpath('//*[@id="top"]/div[1]/section[1]/div/section/div[2]/ul/li[3]/a/text()')[0] return result except: diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 09dc045c9..413107d3a 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -72,7 +72,7 @@ def getSerise(html): #获取系列 return str(x[0]) if len(x) else '' def getTag(html): # 获取标签 klist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') - return klist + return [v for v in klist[1:]] def getExtrafanart(htmlcode): # 获取剧照 html_pather = re.compile(r'
[\s\S]*?
\s*?') html = html_pather.search(htmlcode) diff --git a/WebCrawler/javlib.py b/WebCrawler/javlib.py index 7af0c1468..538fc19f3 100644 --- a/WebCrawler/javlib.py +++ b/WebCrawler/javlib.py @@ -34,10 +34,10 @@ def main(number: str): ) soup = BeautifulSoup(result.text, "html.parser") lx = html.fromstring(str(soup)) - + fanhao_pather = re.compile(r'
(.*?)
') fanhao = fanhao_pather.findall(result.text) - + if "/?v=jav" in result.url: dic = { "title": get_title(lx, soup), From b64dc83e2e8ec165ef680f731a5403bf0ebeedfc Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 14 Nov 2021 08:58:27 +0800 Subject: [PATCH 2/3] simp --- WebCrawler/javbus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/WebCrawler/javbus.py b/WebCrawler/javbus.py index 413107d3a..46493da4c 100644 --- a/WebCrawler/javbus.py +++ b/WebCrawler/javbus.py @@ -72,7 +72,7 @@ def getSerise(html): #获取系列 return str(x[0]) if len(x) else '' def getTag(html): # 获取标签 klist = html.xpath('/html/head/meta[@name="keywords"]/@content')[0].split(',') - return [v for v in klist[1:]] + return klist[1:] def getExtrafanart(htmlcode): # 获取剧照 html_pather = re.compile(r'
[\s\S]*?
\s*?') html = html_pather.search(htmlcode) From 8e9ea6d852e305976e469cb603136ac52d91615f Mon Sep 17 00:00:00 2001 From: lededev Date: Sun, 14 Nov 2021 09:24:49 +0800 Subject: [PATCH 3/3] simp2 --- AV_Data_Capture.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/AV_Data_Capture.py b/AV_Data_Capture.py index 231d6832e..b761435ba 100755 --- a/AV_Data_Capture.py +++ b/AV_Data_Capture.py @@ -474,15 +474,10 @@ def main(): check_update(version) # Download Mapping Table, parallel version - user_data_home = Path.home() / '.local' / 'share' / 'avdc' - map_tab = ( - ('https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_actor.xml', - user_data_home / 'mapping_actor.xml'), - ('https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/mapping_info.xml', - user_data_home / 'mapping_info.xml'), - ('https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/c_number.json', - user_data_home / 'c_number.json') - ) + def fmd(f): + return ('https://raw.githubusercontent.com/yoshiko2/AV_Data_Capture/master/MappingTable/' + f, + Path.home() / '.local' / 'share' / 'avdc' / f) + map_tab = (fmd('mapping_actor.xml'), fmd('mapping_info.xml'), fmd('c_number.json')) res = parallel_download_files(((k, v) for k, v in map_tab if not v.exists())) for i, fp in enumerate(res, start=1): if fp and len(fp):