forked from mvdctop/Movie_Data_Capture
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
236 additions
and
114 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
import re | ||
from lxml import etree | ||
import json | ||
from bs4 import BeautifulSoup | ||
from ADC_function import * | ||
|
||
def getActorPhoto(htmlcode):  # //*[@id="star_qdt"]/li/a/img
    """Map each actor name to their avatar image URL.

    Scans every element with class 'avatar-box'; the <span> holds the
    name and the <img> src holds the photo URL.
    """
    soup = BeautifulSoup(htmlcode, 'lxml')
    photos = {}
    for box in soup.find_all(attrs={'class': 'avatar-box'}):
        photos[box.span.get_text()] = box.img['src']
    return photos
def getTitle(a):
    """Extract the page title from the detail-page HTML.

    Returns the title with '/' characters removed, or '' when the page
    cannot be parsed or the xpath yields nothing usable.
    """
    try:
        html = etree.fromstring(a, etree.HTMLParser())
        # str() of the node list, trimmed of list punctuation; an empty
        # match collapses to ''.
        result = str(html.xpath('/html/body/div[2]/h3/text()')).strip(" ['']")  # [0]
        return result.replace('/', '')
    except Exception:
        # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
        # are no longer swallowed; any parse failure still yields ''.
        return ''
def getActor(a):  # //*[@id="center_column"]/div[2]/div[1]/div/table/tbody/tr[1]/td/text()
    """Return the list of actor names taken from the avatar boxes."""
    soup = BeautifulSoup(a, 'lxml')
    boxes = soup.find_all(attrs={'class': 'avatar-box'})
    return [box.span.get_text() for box in boxes]
def getStudio(a):
    """Extract the studio name from the info-panel HTML."""
    tree = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    nodes = tree.xpath('//p[contains(text(),"制作商: ")]/following-sibling::p[1]/a/text()')
    # str() of the node list, trimmed, with multi-value separators joined by spaces.
    return str(nodes).strip(" ['']").replace("', '", ' ')
def getRuntime(a):
    """Extract the runtime (minutes) text from the info-panel HTML."""
    tree = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    nodes = tree.xpath('//span[contains(text(),"长度:")]/../text()')
    # Strip both list punctuation and the trailing '分钟' (minutes) characters.
    return str(nodes).strip(" ['分钟']")
def getLabel(a):
    """Extract the series/label name from the info-panel HTML."""
    tree = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    nodes = tree.xpath('//p[contains(text(),"系列:")]/following-sibling::p[1]/a/text()')
    return str(nodes).strip(" ['']")
def getNum(a):
    """Extract the identification number from the info-panel HTML."""
    tree = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    nodes = tree.xpath('//span[contains(text(),"识别码:")]/../span[2]/text()')
    return str(nodes).strip(" ['']")
def getYear(release):
    """Extract the first 4-digit year from a release-date string.

    Returns the input unchanged when no 4-digit run is found or when the
    input is not a string (preserving the original fallback behavior of
    the bare except).
    """
    try:
        match = re.search(r'\d{4}', release)
    except TypeError:
        # Non-string input (e.g. None): original code returned it as-is.
        return release
    return match.group() if match else release
def getRelease(a):
    """Extract the release-date text from the info-panel HTML."""
    tree = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    nodes = tree.xpath('//span[contains(text(),"发行时间:")]/../text()')
    return str(nodes).strip(" ['']")
def getCover(htmlcode):
    """Extract the full-size cover image URL from the detail page."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    nodes = tree.xpath('/html/body/div[2]/div[1]/div[1]/a/img/@src')
    return str(nodes).strip(" ['']")
def getCover_small(htmlcode):
    """Extract the thumbnail cover URL from the search-results page."""
    tree = etree.fromstring(htmlcode, etree.HTMLParser())
    nodes = tree.xpath('//*[@id="waterfall"]/div/a/div[1]/img/@src')
    return str(nodes).strip(" ['']")
def getTag(a):
    """Return genre tag strings from the page.

    NOTE: the original comment said "actors" (获取演员), but the selector
    targets elements of class 'genre', i.e. genre tags.
    """
    soup = BeautifulSoup(a, 'lxml')
    return [node.get_text() for node in soup.find_all(attrs={'class': 'genre'})]
|
||
def main(number):
    """Look up *number* on avsox and return its metadata as a JSON string.

    Tries up to three search variants in order: the number as given,
    then with '-' replaced by '_', then with '_' removed entirely.
    """
    a = get_html('https://avsox.asia/cn/search/' + number)
    html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
    # First search hit's detail-page URL; str() of the xpath node list is
    # trimmed so an empty match collapses to ''.
    result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    if result1 == '' or result1 == 'null' or result1 == 'None':
        # Fallback 1: some releases are indexed with '_' instead of '-'.
        a = get_html('https://avsox.asia/cn/search/' + number.replace('-', '_'))
        print(a)
        html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
        result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    if result1 == '' or result1 == 'null' or result1 == 'None':
        # Fallback 2: last resort — drop underscores altogether.
        a = get_html('https://avsox.asia/cn/search/' + number.replace('_', ''))
        print(a)
        html = etree.fromstring(a, etree.HTMLParser())  # //table/tr[1]/td[1]/text()
        result1 = str(html.xpath('//*[@id="waterfall"]/div/a/@href')).strip(" ['']")
    # Fetch the detail page found by the search.
    web = get_html(result1)
    soup = BeautifulSoup(web, 'lxml')
    # Info panel markup fed to the per-field extractors below.
    info = str(soup.find(attrs={'class': 'row movie'}))
    dic = {
        'actor': getActor(web),
        # NOTE(review): str.strip() removes any characters in getNum(web)
        # from both ends (a character-set strip, not substring removal) —
        # presumably meant to trim the ID from the title; confirm intended.
        'title': getTitle(web).strip(getNum(web)),
        'studio': getStudio(info),
        'outline': '',#
        'runtime': getRuntime(info),
        'director': '', #
        'release': getRelease(info),
        'number': getNum(info),
        'cover': getCover(web),
        'cover_small': getCover_small(a),  # thumbnail comes from the search page, not the detail page
        'imagecut': 3,
        'tag': getTag(web),
        'label': getLabel(info),
        'year': getYear(getRelease(info)), # str(re.search('\d{4}',getRelease(a)).group()),
        'actor_photo': getActorPhoto(web),
        'website': result1,
        'source': 'avsox.py',
    }
    js = json.dumps(dic, ensure_ascii=False, sort_keys=True, indent=4, separators=(',', ':'), ) # .encode('UTF-8')
    return js
|
||
#print(main('041516_541')) |
Oops, something went wrong.