Skip to content

Commit

Permalink
Update 3.7-5 DEBUG ONLY
Browse files Browse the repository at this point in the history
  • Loading branch information
root committed Aug 14, 2020
1 parent c5a6871 commit e687035
Show file tree
Hide file tree
Showing 14 changed files with 123 additions and 65 deletions.
43 changes: 23 additions & 20 deletions AV_Data_Capture.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,31 +63,32 @@ def CEF(path):
a = ''


def create_data_and_move(file_path: str, c: config.Config, debug=False):
    """Scrape metadata for one movie file; on failure, park it in the failed folder.

    Args:
        file_path: Path of the movie file to process.
        c: Configuration object; consulted for soft-link mode and the name of
            the failed folder.
        debug: When truthy, ``core_main`` runs unguarded so its exception (and
            traceback) reaches the caller. When falsy, the error is printed and
            the file is linked/moved into the failed folder. Defaults to False
            so existing two-argument callers keep working.
    """
    # Normalized number, eg: 111xxx-222.mp4 -> xxx-222.mp4
    n_number = get_number(file_path)

    def _process():
        print("[!]Making Data for [{}], the number is [{}]".format(file_path, n_number))
        core_main(file_path, n_number, c)
        print("[*]======================================================")

    if debug:
        # Debug mode: deliberately no try/except, so the real error is visible.
        _process()
        return

    try:
        _process()
    except Exception as err:
        print("[-] [{}] ERROR:".format(file_path))
        print('[-]', err)

        # Use the config that was passed in rather than a module-level global.
        failed_dir = os.path.join(os.getcwd(), c.failed_folder())
        try:
            if c.soft_link():
                print("[-]Link {} to failed folder".format(file_path))
                # os.symlink needs the full link path; pointing it at the
                # directory itself fails because the directory already exists.
                os.symlink(file_path, os.path.join(failed_dir, os.path.basename(file_path)))
            else:
                print("[-]Move [{}] to failed folder".format(file_path))
                # Trailing separator keeps "move into this directory" semantics.
                shutil.move(file_path, os.path.join(failed_dir, ''))
        except OSError as move_err:  # shutil.Error is an OSError subclass
            print('[!]', move_err)

def create_data_and_move_with_custom_number(file_path: str, c: config.Config, custom_number=None):
try:
Expand Down Expand Up @@ -145,13 +146,15 @@ def create_data_and_move_with_custom_number(file_path: str, c: config.Config, cu
count = 0
count_all = str(len(movie_list))
print('[+]Find', count_all, 'movies')
if conf.debug() == True:
print('[+]'+' DEBUG MODE ON '.center(54, '-'))
if conf.soft_link():
print('[!] --- Soft link mode is ENABLE! ----')
for movie_path in movie_list: # 遍历电影列表 交给core处理
count = count + 1
percentage = str(count / int(count_all) * 100)[:4] + '%'
print('[!] - ' + percentage + ' [' + str(count) + '/' + count_all + '] -')
create_data_and_move(movie_path, conf)
create_data_and_move(movie_path, conf, conf.debug())

CEF(conf.success_folder())
CEF(conf.failed_folder())
Expand Down
Empty file added WebCrawler/__init__.py
Empty file.
2 changes: 2 additions & 0 deletions WebCrawler/avsox.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import re
from lxml import etree
import json
Expand Down
102 changes: 69 additions & 33 deletions WebCrawler/dlsite.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from lxml import etree
import json
from bs4 import BeautifulSoup
import sys
sys.path.append('../')
from ADC_function import *
# import sys
# import io
Expand All @@ -24,7 +26,10 @@ def getTitle(a):
return result
def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    """Return the names linked in the "声优" (voice actor) row of a work page.

    Args:
        a: HTML source of the work page.

    Returns:
        A list of name strings (empty when the row is absent), or '' if the
        XPath query itself fails.
    """
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        return html.xpath('//th[contains(text(),"声优")]/../td/a/text()')
    except etree.XPathError:
        # Only XPath failures are tolerated; anything else should surface.
        return ''
def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
a = actor.split(',')
Expand All @@ -35,7 +40,13 @@ def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
return d
def getStudio(a):
    """Return the studio name taken from the work page's info table.

    The "系列名" row is tried first and the "社团名" row second; the first
    linked text found wins.

    Args:
        a: HTML source of the work page.

    Returns:
        The first matching link text, or '' when neither row is present.
    """
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    for header in ("系列名", "社团名"):
        found = html.xpath('//th[contains(text(),"{}")]/../td/span[1]/a/text()'.format(header))
        if found:
            return found[0]
    return ''
def getRuntime(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
Expand All @@ -44,7 +55,13 @@ def getRuntime(a):
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a):
    """Return the label name taken from the work page's info table.

    The "系列名" row is tried first and the "社团名" row second; the first
    linked text found wins.

    Args:
        a: HTML source of the work page.

    Returns:
        The first matching link text, or '' when neither row is present.
    """
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    for header in ("系列名", "社团名"):
        found = html.xpath('//th[contains(text(),"{}")]/../td/span[1]/a/text()'.format(header))
        if found:
            return found[0]
    return ''
def getYear(getRelease):
try:
Expand All @@ -54,12 +71,12 @@ def getYear(getRelease):
return getRelease
def getRelease(a):
    """Return the release date from the "贩卖日" row with CJK date markers rewritten.

    ``年`` and ``月`` become ``-`` and ``日`` is removed, e.g.
    ``2020年08月14日`` -> ``2020-08-14``.

    Args:
        a: HTML source of the work page.

    Raises:
        IndexError: If the page has no matching row.
    """
    tree = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    matches = tree.xpath('//th[contains(text(),"贩卖日")]/../td/a/text()')
    release = matches[0]
    for marker, substitute in (('年', '-'), ('月', '-'), ('日', '')):
        release = release.replace(marker, substitute)
    return release
def getTag(a):
    """Return the tags linked in the "分类" (category) row of a work page.

    Args:
        a: HTML source of the work page.

    Returns:
        A list of tag strings (empty when the row is absent), or '' if the
        XPath query itself fails.
    """
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    try:
        return html.xpath('//th[contains(text(),"分类")]/../td/div/a/text()')
    except etree.XPathError:
        # Only XPath failures are tolerated; anything else should surface.
        return ''
Expand All @@ -85,7 +102,10 @@ def getCover(htmlcode):
return result
def getDirector(a):
    """Return the first name linked in the "剧情" (scenario) row of a work page.

    Args:
        a: HTML source of the work page.

    Returns:
        The first matching link text, or '' when the row is absent.
    """
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    names = html.xpath('//th[contains(text(),"剧情")]/../td/a/text()')
    # Explicit emptiness check instead of catching IndexError with a bare except.
    return names[0] if names else ''
def getOutline(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
Expand All @@ -96,36 +116,52 @@ def getOutline(htmlcode):
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
def getSeries(a):
    """Return the series name taken from the work page's info table.

    The "系列名" row is tried first; when it is missing, the "社团名" row is
    used as a fallback.

    Args:
        a: HTML source of the work page.

    Returns:
        The first matching link text, or '' when neither row is present.
    """
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    for header in ("系列名", "社团名"):
        found = html.xpath('//th[contains(text(),"{}")]/../td/span[1]/a/text()'.format(header))
        if found:
            return found[0]
    return ''
def main(number):
    """Scrape the DLsite page for *number* and return its metadata as a JSON string.

    The page is requested with a ``locale=zh-cn`` cookie, matching the
    Simplified-Chinese row headers the ``get*`` helpers search for.

    Args:
        number: Product id, e.g. ``'VJ013479'``; upper-cased before use.

    Returns:
        A JSON object string with the scraped fields. If any step fails, a
        JSON object containing only an empty ``"title"`` is returned instead.
    """
    try:
        number = number.upper()
        url = 'https://www.dlsite.com/pro/work/=/product_id/' + number + '.html'
        htmlcode = get_html(url, cookies={'locale': 'zh-cn'})
        # Parse the release date once; it feeds both 'release' and 'year'.
        release = getRelease(htmlcode)

        dic = {
            'actor': getActor(htmlcode),
            'title': getTitle(htmlcode),
            'studio': getStudio(htmlcode),
            'outline': getOutline(htmlcode),
            'runtime': '',
            'director': getDirector(htmlcode),
            'release': release,
            'number': number,
            'cover': 'https:' + getCover(htmlcode),
            'cover_small': '',
            'imagecut': 0,
            'tag': getTag(htmlcode),
            'label': getLabel(htmlcode),
            'year': getYear(release),
            'actor_photo': '',
            'website': url,
            'source': 'dlsite.py',
            'series': getSeries(htmlcode),
        }
    except Exception:
        # Best-effort scraper boundary: any failure yields the empty-title
        # record, but KeyboardInterrupt/SystemExit are no longer swallowed.
        dic = {
            "title": "",
        }
    return json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))

# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error message before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__":
print(main('VJ013479'))
print(main('VJ013178'))
2 changes: 2 additions & 0 deletions WebCrawler/fanza.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import sys
sys.path.append('../')
import json
import re
from urllib.parse import urlencode
Expand Down
2 changes: 2 additions & 0 deletions WebCrawler/fc2fans_club.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import re
from lxml import etree#need install
import json
Expand Down
2 changes: 2 additions & 0 deletions WebCrawler/jav321.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import json
from bs4 import BeautifulSoup
from lxml import html
Expand Down
5 changes: 4 additions & 1 deletion WebCrawler/javbus.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import sys
sys.path.append('../')
import re
from pyquery import PyQuery as pq#need install
from lxml import etree#need install
from bs4 import BeautifulSoup#need install
import json
from ADC_function import *
import fanza
from WebCrawler import fanza


def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
soup = BeautifulSoup(htmlcode, 'lxml')
Expand Down
2 changes: 2 additions & 0 deletions WebCrawler/javdb.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import re
from lxml import etree
import json
Expand Down
2 changes: 2 additions & 0 deletions WebCrawler/javlib.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import json
import bs4
from bs4 import BeautifulSoup
Expand Down
2 changes: 2 additions & 0 deletions WebCrawler/mgstage.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import re
from lxml import etree
import json
Expand Down
2 changes: 2 additions & 0 deletions WebCrawler/xcity.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import sys
sys.path.append('../')
import re
from lxml import etree
import json
Expand Down
2 changes: 1 addition & 1 deletion config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,4 @@ literals=\()/
folders=failed,JAV_output

[debug_mode]
switch=0
switch=1
20 changes: 10 additions & 10 deletions core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@
from ADC_function import *

# =========website========
import avsox
import fanza
import fc2fans_club
import jav321
import javbus
import javdb
import mgstage
import xcity
import javlib
import dlsite
from WebCrawler import avsox
from WebCrawler import fanza
from WebCrawler import fc2fans_club
from WebCrawler import jav321
from WebCrawler import javbus
from WebCrawler import javdb
from WebCrawler import mgstage
from WebCrawler import xcity
from WebCrawler import javlib
from WebCrawler import dlsite


def escape_path(path, escape_literals: str): # Remove escape literals
Expand Down

0 comments on commit e687035

Please sign in to comment.