Commit

Update Pre-release 3.7

root committed Aug 12, 2020
1 parent e7a7e17 commit 72a9790
Showing 12 changed files with 1,623 additions and 2 deletions.
4 changes: 2 additions & 2 deletions AV_Data_Capture.py
@@ -33,7 +33,7 @@ def movie_lists(root, escape_folder):
if folder in root:
return []
total = []
file_type = ['.mp4', '.avi', '.rmvb', '.wmv', '.mov', '.mkv', '.flv', '.ts', '.webm', '.MP4', '.AVI', '.RMVB', '.WMV','.MOV', '.MKV', '.FLV', '.TS', '.WEBM', ]
file_type = ['.mp4', '.avi', '.rmvb', '.wmv', '.mov', '.mkv', '.flv', '.ts', '.webm', '.MP4', '.AVI', '.RMVB', '.WMV','.MOV', '.MKV', '.FLV', '.TS', '.WEBM', '.iso','.ISO']
dirs = os.listdir(root)
for entry in dirs:
f = os.path.join(root, entry)
@@ -110,7 +110,7 @@ def create_data_and_move_with_custom_number(file_path: str, c: config.Config, cu


if __name__ == '__main__':
version = '3.6'
version = '3.7'

# Parse command line args
single_file_path, config_file, auto_exit, custom_number = argparse_function()
124 changes: 124 additions & 0 deletions WebCrawler/avsox.py
@@ -0,0 +1,124 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)

def getActorPhoto(htmlcode): #//*[@id="star_qdt"]/li/a/img
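# map each actor name (the span text in every 'avatar-box') to its avatar image URL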
soup = BeautifulSoup(htmlcode, 'lxml')
a = soup.find_all(attrs={'class': 'avatar-box'})
d = {}
for i in a:
l = i.img['src']
t = i.span.get_text()
p2 = {t: l}
d.update(p2)
return d
def getTitle(a):
try:
html = etree.fromstring(a, etree.HTMLParser())
result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']") #[0]
return result.replace('/', '')
except:
return ''
def getActor(a): #//*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
soup = BeautifulSoup(a, 'lxml')
a = soup.find_all(attrs={'class': 'avatar-box'})
d = []
for i in a:
d.append(i.span.get_text())
return d
def getStudio(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')).strip(" ['']").replace("', '",' ')
return result1
def getRuntime(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//span[contains(text(),"长度:")]/../text()')).strip(" ['分钟']")
return result1
def getLabel(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')).strip(" ['']")
return result1
def getNum(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')).strip(" ['']")
return result1
def getYear(release):
try:
result = str(re.search(r'\d{4}', release).group())
return result
except:
return release
def getRelease(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//span[contains(text(),"发行时间:")]/../text()')).strip(" ['']")
return result1
def getCover(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')).strip(" ['']")
return result
def getCover_small(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = str(html.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')).strip(" ['']")
return result
def getTag(a): # get genre tags
soup = BeautifulSoup(a, 'lxml')
a = soup.find_all(attrs={'class': 'genre'})
d = []
for i in a:
d.append(i.get_text())
return d
def getSeries(htmlcode):
try:
html = etree.fromstring(htmlcode, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//span[contains(text(),"系列:")]/../span[2]/text()')).strip(" ['']")
return result1
except:
return ''

def main(number):
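# search avsox for the number; if no detail-page link comes back, retry with '-' replaced by '_', then with '_' removed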
a = get_html('https://avsox.host/cn/search/' + number)
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html('https://avsox.host/cn/search/' + number.replace('-', '_'))
print(a)
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
if result1 == '' or result1 == 'null' or result1 == 'None':
a = get_html('https://avsox.host/cn/search/' + number.replace('_', ''))
print(a)
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
web = get_html(result1)
soup = BeautifulSoup(web, 'lxml')
info = str(soup.find(attrs={'class': 'row movie'}))
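# 'web' is the full detail page; 'info' is just the 'row movie' block used for most of the metadata fields below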
dic = {
'actor': getActor(web),
'title': getTitle(web).strip(getNum(web)),
'studio': getStudio(info),
'outline': '',#
'runtime': getRuntime(info),
'director': '', #
'release': getRelease(info),
'number': getNum(info),
'cover': getCover(web),
'cover_small': getCover_small(a),
'imagecut': 3,
'tag': getTag(web),
'label': getLabel(info),
'year': getYear(getRelease(info)), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': getActorPhoto(web),
'website': result1,
'source': 'avsox.py',
'series': getSeries(info),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js

if __name__ == "__main__":
print(main('012717_472'))
131 changes: 131 additions & 0 deletions WebCrawler/dlsite.py
@@ -0,0 +1,131 @@
import re
from lxml import etree
import json
from bs4 import BeautifulSoup
from ADC_function import *
# import sys
# import io
# sys.stdout = io.TextIOWrapper(sys.stdout.buffer, errors = 'replace', line_buffering = True)
#print(get_html('https://www.dlsite.com/pro/work/=/product_id/VJ013152.html'))
#title //*[@id="work_name"]/a/text()
#studio //th[contains(text(),"ブランド名")]/../td/span[1]/a/text()
#release //th[contains(text(),"販売日")]/../td/a/text()
#story //th[contains(text(),"シナリオ")]/../td/a/text()
#voice actor //th[contains(text(),"声優")]/../td/a/text()
#tag //th[contains(text(),"ジャンル")]/../td/div/a/text()
#outline //*[@id="main_inner"]/div[3]/text()
#photo //*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src

#https://www.dlsite.com/pro/work/=/product_id/VJ013152.html

def getTitle(a):
html = etree.fromstring(a, etree.HTMLParser())
result = html.xpath('//*[@id="work_name"]/a/text()')[0]
return result
def getActor(a): # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = html.xpath('//th[contains(text(),"声優")]/../td/a/text()')
return result1
def getActorPhoto(actor): #//*[@id="star_qdt"]/li/a/img
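# build a {actor_name: photo_url} dict from a comma-separated name string; the photo URLs are left empty here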
a = actor.split(',')
d={}
for i in a:
p={i:''}
d.update(p)
return d
def getStudio(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result = html.xpath('//th[contains(text(),"ブランド名")]/../td/span[1]/a/text()')[0]
return result
def getRuntime(a):
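# combine the plain text and the link text of the span next to a '時長' (duration) label, then trim '+' and any trailing 'm'/'i' characters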
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = str(html.xpath('//strong[contains(text(),"時長")]/../span/text()')).strip(" ['']")
result2 = str(html.xpath('//strong[contains(text(),"時長")]/../span/a/text()')).strip(" ['']")
return str(result1 + result2).strip('+').rstrip('mi')
def getLabel(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result = html.xpath('//th[contains(text(),"ブランド名")]/../td/span[1]/a/text()')[0]
return result
def getYear(getRelease):
try:
result = str(re.search(r'\d{4}', getRelease).group())
return result
except:
return getRelease
def getRelease(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = html.xpath('//th[contains(text(),"販売日")]/../td/a/text()')[0]
return result1.replace('年','-').replace('月','-').replace('日','')
def getTag(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath('//th[contains(text(),"ジャンル")]/../td/div/a/text()')
return result
except:
return ''

def getCover_small(a, index=0):
# same issue mentioned below,
# javdb sometimes returns multiple results;
# DO NOT just take the first one, get the one with the correct index number
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
try:
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@src")[index]
if not 'https' in result:
result = 'https:' + result
return result
except: # 2020.7.17 Repair Cover Url crawl
result = html.xpath("//div[@class='item-image fix-scale-cover']/img/@data-src")[index]
if not 'https' in result:
result = 'https:' + result
return result
def getCover(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
result = html.xpath('//*[@id="work_left"]/div/div/div[2]/div/div[1]/div[1]/ul/li/img/@src')[0]
return result
def getDirector(a):
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result = html.xpath('//th[contains(text(),"シナリオ")]/../td/a/text()')[0]
return result
def getOutline(htmlcode):
html = etree.fromstring(htmlcode, etree.HTMLParser())
total = []
result = html.xpath('//*[@id="main_inner"]/div[3]/text()')
for i in result:
total.append(i.strip('\r\n'))
return str(total).strip(" ['']").replace("', '', '",r'\n').replace("', '",r'\n').strip(", '', '")
def getSeries(a):
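# NOTE: this reuses the 声優 (voice actor) XPath, so the 'series' field ends up holding the voice-actor list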
html = etree.fromstring(a, etree.HTMLParser()) # //table/tr[1]/td[1]/text()
result1 = html.xpath('//th[contains(text(),"声優")]/../td/a/text()')
return result1
def main(number):
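# normalise the product id to upper case, fetch the dlsite product page, then scrape each field from the same HTML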
number = number.upper()
htmlcode = get_html('https://www.dlsite.com/pro/work/=/product_id/' + number + '.html')

dic = {
'actor': getActor(htmlcode),
'title': getTitle(htmlcode),
'studio': getStudio(htmlcode),
'outline': getOutline(htmlcode),
'runtime': getRuntime(htmlcode),
'director': getDirector(htmlcode),
'release': getRelease(htmlcode),
'number': number,
'cover': 'https:' + getCover(htmlcode),
'cover_small': '',
'imagecut': 0,
'tag': getTag(htmlcode),
'label': getLabel(htmlcode),
'year': getYear(getRelease(htmlcode)), # str(re.search('\d{4}',getRelease(a)).group()),
'actor_photo': '',
'website': 'https://www.dlsite.com/pro/work/=/product_id/' + number + '.html',
'source': 'dlsite.py',
'series': getSeries(htmlcode),
}
js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
return js

# main('DV-1562')
# input("[+][+]Press enter key exit, you can check the error messge before you exit.\n[+][+]按回车键结束,你可以在结束之前查看和错误信息。")
if __name__ == "__main__":
print(main('VJ013479'))