Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[tver] Add support for TVer #26662

Closed
wants to merge 14 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
[tver] Significantly improved extraction logic and added 'creator' and 'uploader' keys to extraction information
  • Loading branch information
tsukumijima committed Sep 27, 2020
commit bf6f1048b2662dd316321fa54fc9af319bc45e3d
186 changes: 88 additions & 98 deletions youtube_dl/extractor/tver.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,126 +4,116 @@
import re

from .brightcove import BrightcoveNewIE


class TVerIE(BrightcoveNewIE):

_TESTS = [
{
'url': 'https://tver.jp/feature/f0057485', # 'feature'
'md5': '1c1c09662252571992dee0441028b4ec', # MD5 hash of a short video downloaded by running youtube-dl with the --test option
'info_dict': {
'id': 'f0057485', # TVer ID
'display_id': 'ref:hanzawa_naoki---s2----323-001', # Brightcove ID
'ext': 'mp4',
'title': '半沢直樹(新シリーズ) 第1話 子会社VS銀行!飛ばされた半沢の新たな下剋上が始まる',
'description': '大和田(香川照之)の不正を糾弾し、子会社へ出向を命じられた半沢直樹(堺雅人)は、東京セントラル証券営業企画部長に。ある日1500億円超の買収案件が舞い込むが…。',
'thumbnail': 'https://cf-images.ap-northeast-1.prod.boltdns.net/v1/jit/4031511847001/37b5f176-3989-48d9-81d1-4688e80c5531/main/1920x1080/34m10s16ms/match/image.jpg',
'duration': 4100.032,
'timestamp': 1600308623,
'upload_date': '20200917',
'uploader_id': '4031511847001',
},
'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos',
},
{
'url': 'https://tver.jp/corner/f0056997', # 'corner'
'md5': 'aac4e681dcdb775fc44497da4f7bdd05', # MD5 hash of a short video downloaded by running youtube-dl with the --test option
'info_dict': {
'id': 'f0056997', # TVer ID
'display_id': 'ref:kanokari_10', # Brightcove ID
'ext': 'mp4',
'title': '彼女、お借りします 第10話「友達の彼女」-トモカノ-',
'description': 'バイトの初任給を何に使おうか考える和也だったが、ふと栗林のことが脳裏をよぎる。最近栗林の様子がおかしいと、木部から話を聞いていたのだ。ボーッとしていたり、女性不信のつぶやきをしているという。和也は意を決して、栗林を呼び出すことに。翌日、栗林が和也を待っていると──「駿君、だよね?」。待ち合わせ場所にやって来たのは、千鶴だった……!',
'thumbnail': 'https://cf-images.ap-northeast-1.prod.boltdns.net/v1/jit/5102072605001/900216cc-2e97-4c19-93bb-1a531de358d6/main/1920x1080/12m18s37ms/match/image.jpg',
'duration': 1476.075,
'timestamp': 1599554409,
'upload_date': '20200908',
'uploader_id': '5102072605001',
},
'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos',
from .common import InfoExtractor
from ..utils import (
js_to_json,
)


class TVerIE(InfoExtractor):

_TEST = {
'url': 'https://tver.jp/feature/f0057485', # In addition to 'feature', there are also categories such as 'corner' and 'episode'.
'md5': '4ae1bc00e6d55af8f7e2b2c17029f1a3', # MD5 hash of a short video downloaded by running youtube-dl with the --test option
'info_dict': {
'id': 'f0057485', # TVer ID
'display_id': 'ref:hanzawa_naoki---s2----323-001', # Brightcove ID
'ext': 'mp4',
'title': '半沢直樹(新シリーズ) 第1話 子会社VS銀行!飛ばされた半沢の新たな下剋上が始まる',
'description': 'md5:92ce839312ee1e9b162de73fa08b6374',
'thumbnail': r're:https?://.*\.jpg$',
'duration': 4100.032,
'timestamp': 1600308623,
'upload_date': '20200917',
'uploader_id': '4031511847001',
},
{
'url': 'https://tver.jp/episode/76799350', # 'episode'
'md5': 'ad893db02b8a3e949216c463af7ce51e', # MD5 hash of a short video downloaded by running youtube-dl with the --test option
'info_dict': {
'id': '76799350', # TVer ID
'display_id': '2366_2365_4533', # Brightcove ID
'ext': 'mp4',
'title': '港時間 #49 神奈川県/リビエラシーボニアマリーナ 9月18日(金)放送分',
'description': '【毎週金曜 よる12時15分から放送】\n\n日本のヨット文化 を育んできた三浦半島の西海岸、小網代湾にあるリビエラシーボニアマリーナ。昨年から始まったSailGPの日本チームを率いるヨット界のレジェンドに会いました。',
'thumbnail': 'https://cf-images.ap-northeast-1.prod.boltdns.net/v1/jit/4394098883001/904361ca-40d3-4028-8478-8916b9a0ff49/main/1920x1080/58s80ms/match/image.jpg',
'duration': 116.16,
'timestamp': 1600052421,
'upload_date': '20200914',
'uploader_id': '4394098883001',
},
'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos',
},
]
'skip': 'Running from test_download.py doesn\'t seem to be able to handle encrypted HLS videos',
}

IE_NAME = 'TVer'
IE_DESC = 'TVer'

_VALID_URL = r'https?://(?:www\.)?tver\.jp/(corner|episode|feature)/(?P<id>f?[0-9]+)'
_GEO_COUNTRIES = ['JP'] # TVer service is limited to Japan only

BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=ref:%s'
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'

# TODO: add support for FOD
def _real_extract(self, url):

# extract video id
video_id = self._match_id(url)

# download webpage
webpage = self._download_webpage(url, video_id)

# extract video information
video_info_csv = self._search_regex(r'addPlayer\((?P<video_info>.*?)\);', webpage, 'video information', flags=re.DOTALL).strip()
video_info_csv = video_info_csv.replace('\t', '').replace('\n', '').replace('\'', '') # remove \t and \n and '
video_info = video_info_csv.split(',')
# extract tver information
tver_info_csv = self._search_regex(r'addPlayer\((?P<tver_info>.*?)\);', webpage, 'tver information', flags=re.DOTALL).strip()
tver_info_csv = tver_info_csv.replace('\t', '').replace('\n', '').replace('\'', '') # remove \t and \n and '
tver_info = tver_info_csv.split(',')

# extract brightcove account id
brightcove_account_id = video_info[3]
# extract brightcove information
brightcove_account_id = tver_info[3]
brightcove_video_id = 'ref:' + tver_info[4]
brightcove_url = self.BRIGHTCOVE_URL_TEMPLATE % (brightcove_account_id, brightcove_video_id)
brightcove_info = self._extract_brightcove_info(brightcove_url, 'https://tver.jp/')

# extract brightcove video id
brightcove_video_id = video_info[4]
# extract tver description
description = \
self._html_search_meta(['og:description', 'twitter:description'], webpage, 'description', default=None) or \
self._html_search_regex(r'<div[^>]+class="description"[^>]*>(?P<description>.*?)</div>', webpage, 'description', default=None, flags=re.DOTALL)

# brightcove url
brightcove_url = self.BRIGHTCOVE_URL_TEMPLATE % (brightcove_account_id, brightcove_video_id)
# Note: Delegate extraction to BrightcoveNewIE by specifying url_transparent,
# while still exposing the fields TVerIE extracts itself, such as the description.
info_dict = {
'_type': 'url_transparent',
'url': brightcove_url,
'ie_key': BrightcoveNewIE.ie_key(),
'id': video_id, # Tver ID
'display_id': brightcove_video_id, # Brightcove ID
'title': brightcove_info.get('name'),
'description': description,
'thumbnail': re.sub(r'/[0-9]+x[0-9]+/', r'/1920x1080/', brightcove_info.get('poster')), # select large thumbnail
'creator': tver_info[7], # Broadcaster name e.g. 'tbs', 'ntv'
'uploader': tver_info[8], # Delivery platform name e.g. 'TBS FREE', '日テレ無料'
}

# debug output
if self._downloader.params.get('verbose', False):
self.to_screen('Video Information: %s' % video_info)
self.to_screen('Brightcove Account ID: %s' % brightcove_account_id)
self.to_screen('Brightcove Video ID: %s' % brightcove_video_id)
self.to_screen('Brightcove URL: %s' % brightcove_url)
return info_dict

# save the original _VALID_URL
_VALID_URL = self._VALID_URL
def _extract_brightcove_info(self, url, referrer):

# temporarily replace _VALID_URL so that the parent class's _real_extract() method
# matches against the Brightcove URL pattern instead of the TVer one
self._VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*(?P<content_type>video|playlist)Id=(?P<video_id>\d+|ref:[^&]+)'
valid_url = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*(?P<content_type>video|playlist)Id=(?P<video_id>\d+|ref:[^&]+)'

# get video information
info_dict = super(TVerIE, self)._real_extract(brightcove_url)
account_id, player_id, embed, content_type, video_id = re.match(valid_url, url).groups()

# get video description
description = \
self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage, 'description', default=None) or \
self._html_search_regex(r'<div[^>]+class="description"[^>]*>(?P<description>.*?)</div>', webpage, 'description', default=None, flags=re.DOTALL)
def extract_policy_key():
webpage = self._download_webpage(
'http://players.brightcove.net/%s/%s_%s/index.min.js'
% (account_id, player_id, embed), video_id)

# restore the original _VALID_URL
self._VALID_URL = _VALID_URL
policy_key = None

# TVer ID
info_dict['id'] = video_id
# Brightcove ID
info_dict['display_id'] = brightcove_video_id
# select large thumbnail
info_dict['thumbnail'] = info_dict.get('thumbnail').replace('160x90', '1920x1080')
# description
info_dict['description'] = description
catalog = self._search_regex(
r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
if catalog:
catalog = self._parse_json(
js_to_json(catalog), video_id, fatal=False)
if catalog:
policy_key = catalog.get('policyKey')

return info_dict
if not policy_key:
policy_key = self._search_regex(
r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
webpage, 'policy key', group='pk')

return policy_key

# brightcove api url
api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id)

# set header
headers = {
'Accept': 'application/json;pk=%s' % extract_policy_key(),
'Origin': re.search(r'https?://[^/]+', referrer).group(0),
'Referer': referrer,
}

# return brightcove api info
return self._download_json(api_url, video_id, headers=headers)