def get_html(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Returns None (after printing a diagnostic) when the request fails:
    connection error, proxy problem, or the 5-second timeout.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
    try:
        # BUG FIX: the network call is the statement that can raise, so it
        # must live inside the try block.  The original wrapped only the
        # `return getweb.text` line, which never raises, so connection
        # failures escaped the handler entirely.
        getweb = requests.get(str(url), timeout=5, headers=headers)
        # Force UTF-8 so pages with a missing/incorrect charset header
        # decode correctly.
        getweb.encoding = 'utf-8'
        return getweb.text
    except requests.exceptions.RequestException:
        # Narrowed from a bare `except:` so programming errors still surface.
        print("[-]Connect Failed! Please check your Proxy.")
# Module-level metadata globals.  getNumberFromFilename() populates them from
# the scraper's JSON result, and the later pipeline stages (creatFolder,
# PrintFiles, imageDownload, cutImage) read them.
title=''
studio=''
year=''
outline=''
runtime=''
director=''
actor=''
release=''
number=''
cover=''
imagecut=''  # scrapers set this to 1 when cutImage() must crop the cover
def PrintFiles(path):
    """Write the movie's .nfo metadata file into *path*.

    Reads the module-level globals (title, studio, year, outline, runtime,
    director, actor, release, number, cover) that getNumberFromFilename()
    populated, creates *path* if needed, and writes `<number>.nfo` there.

    NOTE(review): the XML element tags of the .nfo body (e.g. <movie>,
    <title>...</title>) appear to have been stripped from this diff by the
    tooling that rendered it -- the empty string literals below are almost
    certainly mangling, not the real source.  Verify against the repository
    before relying on this text.  A further trailing `except` handler is
    also truncated from this view.
    """
    try:
        if not os.path.exists(path):
            os.makedirs(path)
        # "wt" + explicit UTF-8 so non-ASCII titles/actors round-trip.
        with open(path + "/" + number + ".nfo", "wt", encoding='UTF-8') as code:
            print("", file=code)
            print(" " + title + "", file=code)
            print(" ", file=code)
            print(" ", file=code)
            print(" " + studio + "+", file=code)
            print(" " + year + "", file=code)
            print(" "+outline+"", file=code)
            print(" "+outline+"", file=code)
            # runtime may contain spaces around the minute count; strip them.
            print(" "+str(runtime).replace(" ","")+"", file=code)
            print(" " + director + "", file=code)
            print(" " + number + ".png", file=code)
            print(" " + number + ".png", file=code)
            print(" "+number + '.jpg'+"", file=code)
            print(" ", file=code)
            print(" " + actor + "", file=code)
            print(" ", file=code)
            print(" " + studio + "", file=code)
            print(" ", file=code)
            print(" " + number + "", file=code)
            print(" " + release + "", file=code)
            print(" "+cover+"", file=code)
            print(" " + "https://www.javbus.com/"+number + "", file=code)
            print("", file=code)
        print("[+]Writeed! "+path + "/" + number + ".nfo")
    except IOError as e:
        print("[-]Write Failed!")
        print(e)
def creatFolder():
    """Create the failed/ quarantine folder and the per-movie output folder.

    Sets the module-level global `path` to the absolute output directory
    ``JAV_output/<actor>/<number>`` (or a catch-all folder when the actor
    list is extremely long).
    """
    global path
    # Make sure the quarantine folder for unprocessable files exists.
    if not os.path.exists('failed/'):
        os.makedirs('failed/')
    if not os.path.exists('failed/'):
        # Creation silently failed (likely a permissions problem) - bail out.
        print("[-]failed!Dirs can not be make (Please run as Administrator)")
        os._exit(0)
    # A very long actor list would blow past filesystem path limits, so
    # those movies are grouped under one catch-all folder instead.
    overlong = len(actor) > 240
    subdir = '超多人' if overlong else str(actor)
    leaf = number if overlong else str(number)
    path = 'JAV_output' + '/' + subdir + '/' + leaf
    if not os.path.exists(path):
        os.makedirs(path)
    # Store the absolute form; later stages move files into it.
    path = str(os.getcwd()) + '/' + path
def cutImage():
    """Produce the `<number>.png` poster from the downloaded `<number>.jpg`.

    When the scraper set imagecut == 1 the cover is a wide two-panel image,
    so only the right-hand portion is kept; otherwise the jpg is already a
    plain poster and is simply re-saved as png.  Reads the module globals
    `imagecut`, `path` and `number`.
    """
    if imagecut == 1:
        try:
            img = Image.open(path + '/' + number + '.jpg')
            # Keep the right side (slightly more than half) of the cover.
            w = img.width
            h = img.height
            img2 = img.crop((w / 1.9, 0, w, h))
            img2.save(path + '/' + number + '.png')
        except Exception:
            # Narrowed from a bare `except:`; a failed crop is non-fatal.
            print('[-]Cover cut failed!')
    else:
        # No cropping needed - just convert jpg -> png.
        # BUG FIX: removed the unused `imgSize`/`w`/`h` locals the original
        # computed here without ever using them.
        img = Image.open(path + '/' + number + '.jpg')
        img.save(path + '/' + number + '.png')
def strip_fc2_id(number):
    """Return the bare numeric id from an 'FC2-1234' / 'fc2_1234' style number.

    BUG FIX: the original chained str.lstrip("FC2-") calls.  lstrip()
    removes a leading *character set*, not a prefix, so any id whose digits
    begin with '2' was silently corrupted (e.g. 'FC2-223' -> '3').  A prefix
    regex removes exactly the 'fc2' marker plus an optional '-'/'_'.
    """
    return re.sub('^fc2[-_]?', '', str(number), flags=re.IGNORECASE)

def getTitle(htmlcode):
    """Extract the movie title from a fc2fans.club detail page."""
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div/div[1]/h3/text()')).strip(" ['']")
    return result

def getStudio(htmlcode):
    """Extract the seller/studio name from a fc2fans.club detail page."""
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[2]/div/div[1]/h5[3]/a[1]/text()')).strip(" ['']")
    return result

def getNum(htmlcode):
    """Extract the release id from the page.

    NOTE(review): this xpath targets a javbus-style layout (div[5]...) and
    looks copy-pasted; confirm it actually matches fc2fans.club markup.
    """
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
    return result

def getRelease(number):
    """Fetch the official FC2 article page and return its release-date text."""
    a = ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id=' + strip_fc2_id(number) + '&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php')
    html = etree.fromstring(a, etree.HTMLParser())
    result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[2]/dl/dd[4]/text()')).strip(" ['']")
    return result

def getCover(htmlcode, number):
    """Fetch the official FC2 article page and return the cover image URL."""
    a = ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id=' + strip_fc2_id(number) + '&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php')
    html = etree.fromstring(a, etree.HTMLParser())
    result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[1]/div/div[1]/a/img/@src')).strip(" ['']")
    # The page uses protocol-relative image URLs; make them absolute.
    return 'http:' + result

def getOutline(htmlcode, number):
    """Fetch the official FC2 article page and return the plot-outline text."""
    a = ADC_function.get_html('http://adult.contents.fc2.com/article_search.php?id=' + strip_fc2_id(number) + '&utm_source=aff_php&utm_medium=source_code&utm_campaign=from_aff_php')
    html = etree.fromstring(a, etree.HTMLParser())
    # str() of the xpath list leaves literal '\n' escapes and quotes; scrub them.
    result = str(html.xpath('//*[@id="container"]/div[1]/div/article/section[4]/p/text()')).replace("\\n", '', 10000).strip(" ['']").replace("'", '', 10000)
    return result
def main(number):
    """Scrape fc2fans.club for *number* and return movie metadata as a JSON string.

    The dict layout (title/studio/year/.../imagecut) is the contract shared
    with core.py's getNumberFromFilename().
    """
    # BUG FIX: the original evaluated str(number).lstrip("FC2-")... on its
    # own line and discarded the result (str methods return new strings), so
    # it was a no-op; removed.
    htmlcode = ADC_function.get_html('http://fc2fans.club/html/FC2-' + number + '.html')
    # Hoisted: getRelease() performs a network fetch, and the original called
    # it twice for the same value ('year' and 'release').
    release = getRelease(number)
    dic = {
        'title': getTitle(htmlcode),
        'studio': getStudio(htmlcode),
        'year': release,  # NOTE(review): full release date, not a year -- mirrors original
        'outline': getOutline(htmlcode, number),
        'runtime': '',
        'director': getStudio(htmlcode),  # NOTE(review): original maps the studio into 'director' -- confirm intended
        'actor': '',
        'release': release,
        'number': number,
        'cover': getCover(htmlcode, number),
        'imagecut': 0,
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)
    return js
def getTitle(htmlcode):
    """Extract the title from a javbus detail page (spaces -> dashes)."""
    doc = pq(htmlcode)
    title = str(doc('div.container h3').text()).replace(' ', '-')
    return title

def getStudio(htmlcode):
    """Extract the studio name from a javbus detail page."""
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[5]/a/text()')).strip(" ['']")
    return result

def getYear(htmlcode):
    """Extract the 4-digit year from the release-date line."""
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
    # Raw string for the regex (idiom fix; behaviour identical).
    result2 = str(re.search(r'\d{4}', result).group(0))
    return result2

def getCover(htmlcode):
    """Return the full-size cover image URL (href of the big-image link)."""
    doc = pq(htmlcode)
    image = doc('a.bigImage')
    # BUG FIX: removed the unreachable print() that followed this return.
    return image.attr('href')

def getRelease(htmlcode):
    """Extract the release date from a javbus detail page."""
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[2]/text()')).strip(" ['']")
    return result

def getRuntime(htmlcode):
    """Return the text node containing the runtime ('...分鐘')."""
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find(text=re.compile('分鐘'))
    return a

def getActor(htmlcode):
    """Return all performer names as one comma-separated string."""
    b = []
    soup = BeautifulSoup(htmlcode, 'lxml')
    a = soup.find_all(attrs={'class': 'star-name'})
    for i in a:
        b.append(i.text)
    return ",".join(b)

def getNum(htmlcode):
    """Extract the canonical release id from a javbus detail page."""
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[1]/span[2]/text()')).strip(" ['']")
    return result

def getDirector(htmlcode):
    """Extract the director's name from a javbus detail page."""
    html = etree.fromstring(htmlcode, etree.HTMLParser())
    result = str(html.xpath('/html/body/div[5]/div[1]/div[2]/p[4]/a/text()')).strip(" ['']")
    return result

def getOutline(htmlcode):
    """Extract the plot outline from the companion dmm.co.jp page."""
    doc = pq(htmlcode)
    result = str(doc('tr td div.mg-b20.lh4 p.mg-b20').text())
    return result
def _scrape(number, imagecut):
    """Fetch the javbus + dmm pages for *number* and return metadata JSON.

    Shared by main() and main_uncensored(), which previously duplicated this
    entire body and differed only in the 'imagecut' value.
    """
    htmlcode = get_html('https://www.javbus.com/' + number)
    # dmm hosts the plot outline; its cid drops the dash from the number.
    dww_htmlcode = get_html("https://www.dmm.co.jp/mono/dvd/-/detail/=/cid=" + number.replace("-", ''))
    dic = {
        'title': getTitle(htmlcode),
        'studio': getStudio(htmlcode),
        'year': getYear(htmlcode),
        'outline': getOutline(dww_htmlcode),
        'runtime': getRuntime(htmlcode),
        'director': getDirector(htmlcode),
        'actor': getActor(htmlcode),
        'release': getRelease(htmlcode),
        'number': getNum(htmlcode),
        'cover': getCover(htmlcode),
        'imagecut': imagecut,
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'),)
    return js

def main(number):
    """Censored title: the cover is a wide two-panel image (imagecut=1)."""
    return _scrape(number, 1)

def main_uncensored(number):
    """Uncensored title: the cover is used as-is (imagecut=0)."""
    return _scrape(number, 0)
def get_html(url):
    """Fetch *url* (with mgstage's age-check cookie) and return the body text.

    Returns None after printing a diagnostic when the request fails.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
    # 'adc=1' pre-accepts the adult-content confirmation dialog.
    cookies = {'adc': '1'}
    try:
        # BUG FIX: requests.get() is the statement that can raise; the
        # original kept it outside the try and wrapped only the return, so
        # connection failures escaped the handler.
        return requests.get(str(url), timeout=5, cookies=cookies, headers=headers).text
    except requests.exceptions.RequestException:
        # Narrowed from a bare `except:`.
        print("[-]Connect Failed! Please check your Proxy.")

def getTitle(a):
    """Extract the product title from the full detail page HTML."""
    html = etree.fromstring(a, etree.HTMLParser())
    result = str(html.xpath('//*[@id="center_column"]/div[2]/h1/text()')).strip(" ['']")
    return result

def getActor(a):
    """Row 1 of the spec table: performer name."""
    html = etree.fromstring(a, etree.HTMLParser())
    result = str(html.xpath('//table[2]/tr[1]/td/a/text()')).strip(" ['\\n ']")
    return result

def getStudio(a):
    """Row 2 of the spec table: studio/label."""
    html = etree.fromstring(a, etree.HTMLParser())
    result = str(html.xpath('//table[2]/tr[2]/td/a/text()')).strip(" ['\\n ']")
    return result

def getRuntime(a):
    """Row 3 of the spec table: runtime."""
    html = etree.fromstring(a, etree.HTMLParser())
    result = str(html.xpath('//table[2]/tr[3]/td/text()')).strip(" ['\\n ']")
    return result

def getNum(a):
    """Row 4 of the spec table: product number."""
    html = etree.fromstring(a, etree.HTMLParser())
    result = str(html.xpath('//table[2]/tr[4]/td/text()')).strip(" ['\\n ']")
    return result

def getYear(a):
    """Row 5 of the spec table.

    NOTE(review): identical xpath to getRelease() -- this returns the full
    release date, not just a year.  Kept as-is to preserve behaviour, but it
    is almost certainly a copy-paste slip; confirm the intended row.
    """
    html = etree.fromstring(a, etree.HTMLParser())
    result = str(html.xpath('//table[2]/tr[5]/td/text()')).strip(" ['\\n ']")
    return result

def getRelease(a):
    """Row 5 of the spec table: release date."""
    html = etree.fromstring(a, etree.HTMLParser())
    result = str(html.xpath('//table[2]/tr[5]/td/text()')).strip(" ['\\n ']")
    return result
def getCover(htmlcode):
    """Return the cover image URL from the full detail page HTML."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    src = tree.xpath('//*[@id="center_column"]/div[2]/div[1]/div/div/h2/img/@src')
    return str(src).strip(" ['']")

def getDirector(a):
    """Row 7 of the spec table: director name."""
    tree = etree.fromstring(a, etree.HTMLParser())
    name = tree.xpath('//table[2]/tr[7]/td/a/text()')
    return str(name).strip(" ['\\n ']")

def getOutline(htmlcode):
    """Return the first paragraph of the product introduction."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    text = tree.xpath('//*[@id="introduction"]/dd/p[1]/text()')
    return str(text).strip(" ['']")

def main(number):
    """Scrape mgstage for *number* and return movie metadata as a JSON string."""
    page = get_html('https://www.mgstage.com/product/product_detail/' + str(number))
    # The spec table lives inside the 'detail_data' element; the row parsers
    # above operate on its stringified markup.
    detail = str(BeautifulSoup(page, 'lxml').find(attrs={'class': 'detail_data'}))
    info = {
        'title': getTitle(page).replace("\\n", '').replace(' ', ''),
        'studio': getStudio(detail),
        'year': getYear(detail),
        'outline': getOutline(page),
        'runtime': getRuntime(detail),
        'director': getDirector(detail),
        'actor': getActor(detail),
        'release': getRelease(detail),
        'number': number,
        'cover': getCover(page),
        'imagecut': 0,
    }
    return json.dumps(info, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'))